def cifar_demo():
    """Performs a demo classification on cifar
    """
    mpi.mkdir(FLAGS.output_dir)
    logging.info('Loading cifar data...')
    cifar = visiondata.CifarDataset(FLAGS.root, is_training=True)
    cifar_test = visiondata.CifarDataset(FLAGS.root, is_training=False)
    conv = pipeline.ConvLayer([
            pipeline.PatchExtractor([6, 6], 1), # extracts patches
            pipeline.MeanvarNormalizer({'reg': 10}), # normalizes the patches
            pipeline.LinearEncoder({},
                    trainer = pipeline.ZcaTrainer({'reg': 0.1})), # Does whitening
            pipeline.ThresholdEncoder({'alpha': 0.25, 'twoside': True},
                    trainer = pipeline.OMPTrainer(
                            {'k': 800, 'max_iter': 100})), # does encoding
            pipeline.SpatialPooler({'grid': (2, 2), 'method': 'ave'}) # average pool
            ])
    logging.info('Training the pipeline...')
    conv.train(cifar, 50000)
    logging.info('Dumping the pipeline...')
    if mpi.is_root():
        with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), 'w') as fid:
            pickle.dump(conv, fid)
    # make sure root has finished writing before every node reads the model back
    mpi.barrier()
    with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), 'r') as fid:
        conv = pickle.load(fid)
    logging.info('Extracting features...')
    Xtrain = conv.process_dataset(cifar, as_2d = True)
    mpi.dump_matrix_multi(Xtrain,
                          os.path.join(FLAGS.output_dir, 
                                       FLAGS.feature_file+'_train'))
    Ytrain = cifar.labels().astype(np.int)
    Xtest = conv.process_dataset(cifar_test, as_2d = True)
    mpi.dump_matrix_multi(Xtest,
                          os.path.join(FLAGS.output_dir, 
                                       FLAGS.feature_file+'_test'))
    Ytest = cifar_test.labels().astype(np.int)

    # normalization
    m, std = classifier.feature_meanstd(Xtrain)
    Xtrain -= m
    Xtrain /= std
    Xtest -= m
    Xtest /= std
    
    w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.01)
    if mpi.is_root():
        with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), 'w') as fid:
            pickle.dump({'m': m, 'std': std, 'w': w, 'b': b}, fid)
    accu = np.sum(Ytrain == (np.dot(Xtrain,w)+b).argmax(axis=1)) \
            / float(len(Ytrain))
    accu_test = np.sum(Ytest == (np.dot(Xtest,w)+b).argmax(axis=1)) \
            / float(len(Ytest))
    
    logging.info('Training accuracy: %f' % accu)
    logging.info('Testing accuracy: %f' % accu_test)
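A minimal sketch of reusing the SVM dumped by the demo above, assuming the same FLAGS module as in the code. The dict keys ('m', 'std', 'w', 'b') match what cifar_demo() saves; the helper name and the feature matrix Xnew are hypothetical:

import os
import pickle
import numpy as np

def classify_features(Xnew):
    """Applies the dumped normalization and one-vs-all SVM to new features."""
    with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), 'r') as fid:
        model = pickle.load(fid)
    Xnew = (Xnew - model['m']) / model['std']
    # one-vs-all decision: pick the class with the largest linear score
    return (np.dot(Xnew, model['w']) + model['b']).argmax(axis=1)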
def cifar_demo():
    """Performs a demo classification on cifar
    """
    mpi.mkdir(FLAGS.output_dir)
    logging.info("Loading cifar data...")
    cifar = visiondata.CifarDataset(FLAGS.root, is_training=True)
    cifar_test = visiondata.CifarDataset(FLAGS.root, is_training=False)

    # try: use sub images
    # cifar = datasets.SubImageSet(cifar, [28,28], 1)
    # cifar_test = datasets.CenterRegionSet(cifar_test, [28,28])

    conv = pipeline.ConvLayer(
        [
            pipeline.PatchExtractor([6, 6], 1),  # extracts patches
            pipeline.MeanvarNormalizer({"reg": 10}),  # normalizes the patches
            pipeline.LinearEncoder({}, trainer=pipeline.ZcaTrainer({"reg": 0.1})),  # Does whitening
            pipeline.ThresholdEncoder(
                {"alpha": 0.25, "twoside": True}, trainer=pipeline.OMPTrainer({"k": 1600, "max_iter": 100})
            ),  # does encoding
            pipeline.SpatialPooler({"grid": (4, 4), "method": "max"}),  # average pool
        ]
    )
    logging.info("Training the pipeline...")
    conv.train(cifar, 400000)
    logging.info("Dumping the pipeline...")
    if mpi.is_root():
        with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), "w") as fid:
            pickle.dump(conv, fid)
            fid.close()
    logging.info("Extracting features...")
    Xtrain = conv.process_dataset(cifar, as_2d=True)
    mpi.dump_matrix_multi(Xtrain, os.path.join(FLAGS.output_dir, FLAGS.feature_file + "_train"))
    Ytrain = cifar.labels().astype(np.int)
    Xtest = conv.process_dataset(cifar_test, as_2d=True)
    mpi.dump_matrix_multi(Xtest, os.path.join(FLAGS.output_dir, FLAGS.feature_file + "_test"))
    Ytest = cifar_test.labels().astype(np.int)
    # normalization
    m, std = classifier.feature_meanstd(Xtrain)
    Xtrain -= m
    Xtrain /= std
    Xtest -= m
    Xtest /= std

    w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.005)
    if mpi.is_root():
        with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), "w") as fid:
            pickle.dump({"m": m, "std": std, "w": w, "b": b}, fid)
    accu = np.sum(Ytrain == (np.dot(Xtrain, w) + b).argmax(axis=1)) / float(len(Ytrain))
    accu_test = np.sum(Ytest == (np.dot(Xtest, w) + b).argmax(axis=1)) / float(len(Ytest))

    logging.info("Training accuracy: %f" % accu)
    logging.info("Testing accuracy: %f" % accu_test)
def omp_n(X, k, num_active, max_iter=100, tol=1e-4):
    """OMP training with MPI
    
    Input:
        X: a num_data_local * dim numpy matrix containing the data, each row
            being a datum.
        k: the dictionary size.
        num_active: the number of active dictionary entries for each datum.
        max_iter: (optional) the maximum number of iterations. Default 100.
        tol: (optional) the tolerance threshold to determine convergence.
            Default 1e-4.
    """
    # vdata is used for testing convergence
    Nlocal = X.shape[0]
    vdatalocal = np.sum(np.var(X, 0))
    N = mpi.COMM.allreduce(Nlocal)
    vdata = mpi.COMM.allreduce(vdatalocal)
    vdata /= N
    # random initialization
    centroids = np.random.randn(k, X.shape[1])
    centroids /= np.sqrt(np.sum(centroids ** 2, axis=1)).reshape(k, 1)
    centroids_all = mpi.COMM.gather(centroids)
    # make sure we are using the same centroids on all nodes
    if mpi.is_root():
        centroids_all = np.vstack(centroids_all)
        centroids[:] = centroids_all[np.random.permutation(centroids_all.shape[0])[:k]]
    mpi.COMM.Bcast(centroids, root=0)

    timer = util.Timer()
    for iter_id in range(max_iter):
        logging.debug(
            "OMP-%d iter %d, last iteration %s, elapsed %s" % (num_active, iter_id, timer.lap(), timer.total())
        )
        centroids_old = centroids.copy()
        labels, val = omp_n_predict(X, centroids, num_active)
        centroids = omp_n_maximize(X, labels, val, k)
        # check convergence on root
        if mpi.is_root():
            converged = np.sum((centroids_old - centroids) ** 2) < tol * vdata
        else:
            converged = None
        converged = mpi.COMM.bcast(converged)
        if converged:
            logging.debug("OMP has converged.")
            break
    else:
        logging.debug("OMP reached the maximum number of iterations.")
    return centroids
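A hedged usage sketch for omp_n() above: each MPI rank passes only its locally hosted rows, and every rank ends up with the same dictionary. The data shape and the k / num_active values are illustrative only:

import numpy as np

# e.g. 6x6 RGB patches flattened to 108 dimensions; one shard per rank
X_local = np.random.randn(10000, 108)
dictionary = omp_n(X_local, k=800, num_active=10)
assert dictionary.shape == (800, 108)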
Example #5
def compute_caltech_features():
    caltech = datasets.TwoLayerDataset(FLAGS.root, ['jpg'], max_size=300)
    conv = pipeline.ConvLayer([
        dsift.DsiftExtractor(FLAGS.sift_size, FLAGS.sift_stride),
        pipeline.LLCEncoder({'k': FLAGS.llc_k},
                            trainer=pipeline.KmeansTrainer(
                                {'k': FLAGS.dict_size})),
        pipeline.PyramidPooler({
            'level': 3,
            'method': 'max'
        })
    ])
    conv.train(caltech, 400000)
    feat = conv.process_dataset(caltech, as_2d=True)

    mpi.mkdir(FLAGS.feature_dir)
    if mpi.is_root():
        with (open(os.path.join(FLAGS.feature_dir, FLAGS.model_file),
                   'w')) as fid:
            pickle.dump(conv, fid)

    mpi.dump_matrix_multi(feat,
                          os.path.join(FLAGS.feature_dir, FLAGS.feature_file))
    mpi.dump_matrix_multi(caltech.labels(),
                          os.path.join(FLAGS.feature_dir, FLAGS.label_file))
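Reading the dumped features back would mirror the two dump calls above; this sketch assumes iceberk's mpi module offers a load_matrix_multi counterpart to dump_matrix_multi (not shown on this page):

feat = mpi.load_matrix_multi(os.path.join(FLAGS.feature_dir, FLAGS.feature_file))
labels = mpi.load_matrix_multi(os.path.join(FLAGS.feature_dir, FLAGS.label_file))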
Example #6
 def __init__(self, rootfolder, is_training):
     super(MNISTDataset, self).__init__()
     if mpi.is_root():
         # root loads the data
         if is_training:
             self._data = self._read_byte_data(
                     os.path.join(rootfolder,'train-images-idx3-ubyte'),
                     16, (MNISTDataset.__num_train,) + \
                             MNISTDataset.__image_dim)
             self._label = self._read_byte_data(
                 os.path.join(rootfolder, 'train-labels-idx1-ubyte'), 8,
                 [MNISTDataset.__num_train]).astype(np.int)
         else:
             self._data = self._read_byte_data(
                     os.path.join(rootfolder,'t10k-images-idx3-ubyte'),
                     16, (MNISTDataset.__num_test,) + \
                             MNISTDataset.__image_dim)
             self._label = self._read_byte_data(
                 os.path.join(rootfolder, 't10k-labels-idx1-ubyte'), 8,
                 [MNISTDataset.__num_test]).astype(np.int)
     else:
         self._data = None
         self._label = None
     self._data = mpi.distribute(self._data)
     self._label = mpi.distribute(self._label)
     self._dim = MNISTDataset.__image_dim
     self._channels = 1
 def load_cifar10(self, rootfolder, is_training):
     """loads the cifar-10 dataset
     """
     if mpi.is_root():
         if is_training:
             self._data = np.empty((CifarDataset.__num_train,) + \
                                   CifarDataset.__image_dim)
             self._label = np.empty(CifarDataset.__num_train)
             # training batches
             for i in range(CifarDataset.__num_batches):
                 with open(os.path.join(rootfolder,
                         'data_batch_{0}'.format(i+1)),'r') as fid:
                     batch = pickle.load(fid)
                 start_idx = CifarDataset.__batchsize * i
                 end_idx = CifarDataset.__batchsize * (i+1)
                 self._data[start_idx:end_idx] = \
                         CifarDataset.get_images_from_matrix(batch['data'])
                 self._label[start_idx:end_idx] = np.array(batch['labels'])
         else:
             with open(os.path.join(rootfolder, 'test_batch'), 'r') as fid:
                 batch = pickle.load(fid)
             self._data = CifarDataset.get_images_from_matrix(batch['data'])
             self._label = np.array(batch['labels'])
     else:
         self._data = None
         self._label = None
     self._data = mpi.distribute(self._data)
     self._label = mpi.distribute(self._label)
Example #10
def get_predictions_logreg_perclass(X, weights):
    pred = mathutil.dot(X,weights[0])+weights[1]
    prob = 1.0/(1.0+np.exp(-pred))
    prob = mpi.COMM.gather(prob)
    if mpi.is_root():
        return np.vstack(prob)
    else:
        return np.zeros((0))
 def __init__(self, root_folder, extensions, prefetch = False, 
              target_size = None, max_size = None, min_size = None,
              center_crop = None):
     """ Initialize from a two-layer storage
     Input:
         root_folder: the root that contains the data. Under root_folder
             there should be a list of folders, under which there should be
             a list of files
         extensions: the list of extensions that should be used to filter the
             files. Should be like ['png', 'jpg']. It's case insensitive.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size, max_size, min_size, center_crop: see manipulate() for
             details.
     """
     super(TwoLayerDataset, self).__init__()
     if mpi.agree(not os.path.exists(root_folder)):
         raise OSError, "The specified folder does not exist."
     logging.debug('Loading from %s' % (root_folder,))
     if type(extensions) is str:
         extensions = [extensions]
     extensions = set(extensions)
     if mpi.is_root():
         # get files first
         files = glob.glob(os.path.join(root_folder, '*', '*'))
         # select those that fits the extension
         files = [f for f in files  if any([
                         f.lower().endswith(ext) for ext in extensions])]
         logging.debug("A total of %d images." % (len(files)))
         # get raw labels
         labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
         classnames = list(set(labels))
         # sort so we get a reasonable class order
         classnames.sort()
         name2val = dict(zip(classnames, range(len(classnames))))
         labels = [name2val[label] for label in labels]
     else:
         files = None
         classnames = None
         labels = None
     mpi.barrier()
     self._rawdata = mpi.distribute_list(files)
     self._data = self._rawdata
     self._prefetch = prefetch
     self._target_size = target_size
     self._max_size = max_size
     self._min_size = min_size
     self._center_crop = center_crop
     if target_size != None:
         self._dim = tuple(target_size) + (3,)
     else:
         self._dim = False
     self._channels = 3
     if prefetch:
         self._data = [self._read(idx) for idx in range(len(self._data))]
     self._label = mpi.distribute_list(labels)
     self._classnames = mpi.COMM.bcast(classnames)
Example #12
 def _add_default_fminargs(self):
     """
     This function adds some default args to fmin, if we have not explicitly
     specified them.
     """
     self._fminargs["maxfun"] = self._fminargs.get("maxfun", 1000)
     self._fminargs["disp"] = self._fminargs.get("disp", 1)
     # even when fmin displays outputs, we set non-root display to none
     if not mpi.is_root():
         self._fminargs["disp"] = 0
Example #13
 def testDumpLoad(self):
     local_size = 2
     mat_sources = [np.random.rand(local_size),
                    np.random.rand(local_size, 2),
                    np.random.rand(local_size, 2, 3)]
     for mat in mat_sources:
         mpi.dump_matrix(mat, _MPI_DUMP_TEST_FILE)
         if mpi.is_root():
             mat_dumped = np.load(_MPI_DUMP_TEST_FILE)
             self.assertEqual(mat_dumped.shape, (local_size * mpi.SIZE,) + mat.shape[1:])
         mat_read = mpi.load_matrix(_MPI_DUMP_TEST_FILE)
         self.assertEqual(mat.shape, mat_read.shape)
Example #14
def get_predictions_logreg(X, weights):
    pred = mathutil.dot(X,weights[0])+weights[1]
    prob = pred - pred.max(axis=1)[:,np.newaxis]
    mathutil.exp(prob, out=prob)
    prob /= prob.sum(axis=1)[:, np.newaxis]
    prob = mpi.COMM.gather(prob)
    if mpi.is_root():
        return np.vstack(prob)
    else:
        return np.zeros((0))
Example #16
 def get_predictions_nn(self, X, special_bias=None):
     X_feats = np.ascontiguousarray(np.hstack((X[self.feat_list[i]] for i in range(len(self.feat_list)))))
     X_feats -= self.m
     X_feats /= self.std
     if special_bias != None:
         X_feats = np.ascontiguousarray(np.hstack((X_feats, special_bias)))
     prob = get_predictions_nn(X_feats, self._weights_nn, self._arch)[0]
     prob = mpi.COMM.gather(prob)
     if mpi.is_root():
         return np.vstack(prob)
     else:
         return np.zeros((0))
Example #17
 def train(self, incoming_patches):
     m, covmat = mathutil.mpi_meancov(incoming_patches)
     if mpi.is_root():
         # only root carries out the computation
         eigval, eigvec = np.linalg.eigh(covmat)
         reg = self.specs.get('reg', np.finfo(np.float64).eps)
         W = eigvec * 1.0 / (np.sqrt(np.maximum(eigval, 0.0)) + reg)
     else:
         eigval, eigvec, W = None, None, None
     W = mpi.COMM.bcast(W)
     eigval = mpi.COMM.bcast(eigval)
     eigvec = mpi.COMM.bcast(eigvec)
     return (W, -m), (eigval, eigvec, covmat)
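A single-process sanity check of the whitening computed by train() above, assuming the enclosing LinearEncoder applies the returned (W, b) as np.dot(X + b, W), i.e. np.dot(X - m, W). A tiny reg is used here so the output covariance comes out numerically as the identity:

import numpy as np

X = np.random.randn(5000, 36)     # stand-in for mean/var-normalized patches
m = X.mean(axis=0)
eigval, eigvec = np.linalg.eigh(np.cov(X, rowvar=0))
W = eigvec * 1.0 / (np.sqrt(np.maximum(eigval, 0.0)) + 1e-8)
Y = np.dot(X - m, W)
print np.allclose(np.cov(Y, rowvar=0), np.eye(36), atol=1e-6)   # True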
Example #18
def get_predictions_nn(X, weights,arch):
    hid = mathutil.dot(X,weights[0])+weights[1]
    hid = 1.0/(1+np.exp(-hid)) #sigmoid
    pred = mathutil.dot(hid,weights[2])+weights[3]
    #prob = pred - pred.max(axis=1)[:,np.newaxis]
    #mathutil.exp(prob, out=prob)
    #prob /= prob.sum(axis=1)[:, np.newaxis]
    prob = 1.0/(1.0+np.exp(-pred))
    prob = mpi.COMM.gather(prob)
    hid = mpi.COMM.gather(hid)
    if mpi.is_root():
        return np.vstack(prob),np.vstack(hid)
    else:
        return np.zeros((0)),np.zeros((0))
Example #19
def omp1(X, k, max_iter=100, tol=1e-4):
    '''omp1 training with MPI
    
    Note that the X matrix passed should be the local data each node is hosting.
    '''
    # vdata is used for testing convergence
    Nlocal = X.shape[0]
    vdatalocal = np.sum(np.var(X, 0))
    N = mpi.COMM.allreduce(Nlocal)
    vdata = mpi.COMM.allreduce(vdatalocal)
    vdata /= N
    # random initialization
    centroids = np.random.randn(k, X.shape[1])
    centroids /= np.sqrt(np.sum(centroids**2, axis=1)).reshape(k, 1)
    centroids_all = mpi.COMM.gather(centroids)
    # make sure we are using the same centroids on all nodes
    if mpi.is_root():
        centroids_all = np.vstack(centroids_all)
        centroids[:] = centroids_all[\
                np.random.permutation(centroids_all.shape[0])[:k]]
    mpi.COMM.Bcast(centroids, root=0)

    for iter_id in range(max_iter):
        logging.debug("OMP iteration %d" % (iter_id,))
        centroids_old = centroids.copy()
        labels, val = omp1_predict(X, centroids)
        centroids = omp1_maximize(X, labels, val, k)
        # check convergence on root
        if mpi.is_root():
            converged = np.sum((centroids_old - centroids) ** 2) < tol * vdata
        else:
            converged = None
        converged = mpi.COMM.bcast(converged)
        if converged:
            logging.debug("OMP has converged.")
            break
    return centroids
Example #21
 def testFeatureMeanStd(self):
     mat = np.random.rand(100,50)
     m_test, std_test = classifier.feature_meanstd(mat)
     # use the naive approach to compute the mean and std
     mats = mpi.COMM.gather(mat)
     if mpi.is_root():
         mats = np.vstack(mats)
         m = mats.mean(0)
         std = mats.std(0)
     else:
         m = None
         std = None
     m = mpi.COMM.bcast(m)
     std = mpi.COMM.bcast(std)
     np.testing.assert_array_almost_equal(m, m_test)
     np.testing.assert_array_almost_equal(std, std_test)
Example #22
 def get_data(filename):
     """This is a wrapper function that returns the images in the right
     axes order
     """
     if mpi.is_root():
         matdata = io.loadmat(filename)
         X = matdata['X'].reshape(\
                 (matdata['X'].shape[0],) + STL10Dataset._image_dim[::-1])
         # make it contiguous so we can do mpi distribute
         X = np.ascontiguousarray(np.transpose(X, axes=[0, 3, 2, 1]),
                                  dtype=X.dtype)
         Y = matdata['y'].astype(int).flatten()
     else:
         X = None
         Y = None
     return mpi.distribute(X), mpi.distribute(Y)
Example #23
 def average_precision(Y, pred):
     """Average Precision for binary classification
     """
     # since we need to compute the precision recall curve, we have to
     # compute this on the root node.
     Y = mpi.COMM.gather(Y)
     pred = mpi.COMM.gather(pred)
     if mpi.is_root():
         Y = np.hstack(Y)
         pred = np.hstack(pred)
         precision, recall, _ = metrics.precision_recall_curve(Y == 1, pred)
         ap = metrics.auc(recall, precision)
     else:
         ap = None
     mpi.barrier()
     return mpi.COMM.bcast(ap)
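A single-process sanity check of average_precision() above, using the same sklearn calls on made-up labels and scores:

import numpy as np
from sklearn import metrics

Y = np.array([1, 1, 0, 1, 0, 0])
pred = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.1])
precision, recall, _ = metrics.precision_recall_curve(Y == 1, pred)
print 'AP =', metrics.auc(recall, precision)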
Example #25
 def __init__(self,
              root,
              is_training,
              crop=False,
              prefetch=False,
              target_size=None):
     """Load the dataset.
     Input:
         root: the root folder of the CUB_200_2011 dataset.
         is_training: if true, load the training data. Otherwise, load the
             testing data.
         crop: if False, does not crop the bounding box. If a real value,
             crop is the ratio of the bounding box that gets cropped.
             e.g., if crop = 1.5, the resulting image will be 1.5 * the
             bounding box area.
         prefetch: if True, the images are prefetched to avoid disk read. If
             you have a large number of images, prefetch would require a lot
             of memory.
         target_size: if provided, all images are resized to the size 
             specified. Should be a list of two integers, like [640,480].
         
     Note that we will use the python indexing (labels start from 0).
     """
     if is_training:
         mat_filename = 'train_list.mat'
     else:
         mat_filename = 'test_list.mat'
     if mpi.is_root():
         matfile = io.loadmat(os.path.join(root, mat_filename))
         labels = np.array(matfile['labels'].flatten() - 1, dtype=np.int)
         files = [f[0][0] for f in matfile['file_list']]
     else:
         labels = None
         files = None
     self._data = mpi.distribute_list(files)
     self._label = mpi.distribute(labels)
     self._root = root
     self._prefetch = prefetch
     self._crop = crop
     self._target_size = target_size
     if target_size is not None:
         self._dim = tuple(target_size) + (3, )
     else:
         self._dim = False
     self._channels = 3
     if self._prefetch:
         self._data = [self._read(i) for i in range(len(self._data))]
 def obj(wb,solver):
     '''
     The objective function used by fmin
     '''
     # obtain w and b
     K = solver._K
     dim = solver._dim
     w = wb[:K*dim].reshape((dim, K))
     b = wb[K*dim:]
     # pred is a matrix of size [num_datalocal, K]
     mathutil.dot(solver._X, w, out = solver._pred)
     solver._pred += b
     # compute the loss function
     if solver.gpredcache:
         flocal,gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    solver._gpred, solver._gpredcache,
                                    **solver._lossargs)
     else:
         flocal,gpred = solver.loss(solver._Y, solver._pred, solver._weight,
                                    **solver._lossargs)
     mathutil.dot(solver._X.T, gpred,
                  out = solver._glocal[:K*dim].reshape(dim, K))
     solver._glocal[K*dim:] = gpred.sum(axis=0)
     # we should normalize them with the number of data
     flocal /= solver._num_data
     solver._glocal /= solver._num_data
     # add regularization term, but keep in mind that we have multiple nodes
     # so we only carry it out on root to make sure we only added one 
     # regularization term
     if mpi.is_root():
         freg, greg = solver.reg(w, **solver._regargs)
         flocal += solver._gamma * freg
         solver._glocal[:K*dim] += solver._gamma * greg.ravel()
     # do mpi reduction
     mpi.barrier()
     f = mpi.COMM.allreduce(flocal)
     mpi.COMM.Allreduce(solver._glocal, solver._g)
     ######### DEBUG PART ##############
     if np.isnan(f):
         # check all the components to see what went wrong.
         print 'rank %s: isnan X: %d' % (mpi.RANK,np.any(np.isnan(solver._X)))
         print 'rank %s: isnan Y: %d' % (mpi.RANK,np.any(np.isnan(solver._Y)))
         print 'rank %s: isnan flocal: %d' % (mpi.RANK,np.any(np.isnan(flocal)))
         print 'rank %s: isnan pred: %d' % (mpi.RANK,np.any(np.isnan(solver._pred)))
         print 'rank %s: isnan w: %d' % (mpi.RANK,np.any(np.isnan(w)))
         print 'rank %s: isnan b: %d' % (mpi.RANK,np.any(np.isnan(b)))
     return f, solver._g
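The (f, g) return value of obj() above is exactly what scipy's fmin_l_bfgs_b consumes when no separate gradient function is supplied, and the 'maxfun' / 'disp' entries set in _add_default_fminargs() elsewhere on this page are keyword arguments of that routine. A self-contained toy objective in place of the MPI solver:

import numpy as np
from scipy.optimize import fmin_l_bfgs_b

def toy_obj(x):
    # quadratic bowl with its minimum at [1, 2]; returns (value, gradient)
    g = x - np.array([1.0, 2.0])
    return 0.5 * np.dot(g, g), g

x_opt, f_opt, info = fmin_l_bfgs_b(toy_obj, np.zeros(2), maxfun=1000, disp=0)
print x_opt   # approximately [ 1.  2.]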
Example #28
 def get_predictions_nn_old(self, X, special_bias=None):
     X_feats = np.ascontiguousarray(np.hstack((X[self.feat_list[i]] for i in range(len(self.feat_list)))))
     X_feats -= self.m
     X_feats /= self.std
     if special_bias != None:
         X_feats = np.ascontiguousarray(np.hstack((X_feats, special_bias)))
     DS = ClassificationDataSet( X_feats.shape[1], 1, nb_classes=2 )
     #for i in range(X_feats.shape[0]):
     #    DS.addSample( X_feats[i,:], [0.0] )
     DS.setField('input', X_feats)
     DS.setField('target', np.zeros((X_feats.shape[0],1)))
     DS._convertToOneOfMany()
     prob = self._nn.activateOnDataset(DS)
     prob = mpi.COMM.gather(prob)
     if mpi.is_root():
         return np.vstack(prob)
     else:
         return np.zeros((0))
Example #29
def demo_kmeans():
    """A simple kmeans demo
    """
    print 'Running kmeans demo'
    data = np.vstack((np.random.randn(500,2)+1,\
                      np.random.randn(500,2)-1))
    centers, labels, inertia = kmeans(data, 8, n_init=1, max_iter=5)
    print 'inertia =', inertia
    print 'centers = \n', centers
    try:
        from matplotlib import pyplot
        if mpi.is_root():
            pyplot.scatter(data[:, 0], data[:, 1], c=labels)
            pyplot.show()
        mpi.barrier()
    except Exception:
        print 'cannot show figure. will simply pass'
Example #32
 def load_cifar100(self, rootfolder, is_training):
     """loads the cifar-100 dataset
     """
     if mpi.is_root():
         if is_training:
             filename = 'train'
         else:
             filename = 'test'
         with open(rootfolder + os.sep + filename) as fid:
             batch = pickle.load(fid)
         self._data = CifarDataset.get_images_from_matrix(batch['data'])
         self._coarselabel = np.array(batch['coarse_labels'])
         self._label = np.array(batch['fine_labels'])
     else:
         self._data = None
         self._coarselabel = None
         self._label = None
     self._data = mpi.distribute(self._data)
     self._coarselabel = mpi.distribute(self._coarselabel)
     self._label = mpi.distribute(self._label)
def prune_conv(conv, dataset, num_patches, num_features):
    if not isinstance(conv[-1], pipeline.Pooler):
        raise TypeError, "The last layer should be a pooler."
    if not isinstance(conv[-2], pipeline.FeatureEncoder):
        raise TypeError, "The second last layer should be an encoder."
    logging.debug('Randomly sampling pooled features...')
    features = conv.sample(dataset, num_patches, True)
    if features.shape[1] != conv[-2].dictionary.shape[0]:
        raise ValueError, "Huh, I can't figure out the encoding method.\n"\
                "Feature shape: %d, dictionary size: %d" % \
                (features.shape[1], conv[-2].dictionary.shape[0])
    logging.debug('Perform feature selection...')
    covmat = mathutil.mpi_cov(features)
    if mpi.is_root():
        selected_idx = max_variance_feature_selection(covmat, num_features)
    else:
        selected_idx = None
    selected_idx = mpi.COMM.bcast(selected_idx)
    conv[-2].dictionary = conv[-2].dictionary[selected_idx]
    return covmat
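max_variance_feature_selection() used above is not shown on this page; a hypothetical stand-in that simply keeps the num_features codes with the largest variance (the diagonal of covmat) could look like this, ignoring any redundancy between codes that the real routine may account for:

import numpy as np

def max_variance_selection_sketch(covmat, num_features):
    # the diagonal of the covariance matrix holds the per-code variances
    return np.argsort(np.diag(covmat))[::-1][:num_features]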
Example #37
def demo_read(root):
    from iceberk import visualize
    vis = visualize.PatchVisualizer()
    print 'Loading training data...'
    traindata = STL10Dataset(root, 'train')
    print 'My training data size:', traindata.size()
    print 'Loading testing data...'
    testdata = STL10Dataset(root, 'test')
    print 'My testing data size:', testdata.size()
    print 'Loading unlabeled data...'
    unlabeleddata = STL10Dataset(root, 'unlabeled')
    print 'My unlabeled data size:', unlabeleddata.size()
    if mpi.is_root():
        vis.pyplot.figure()
        vis.show_multiple(traindata.raw_data()[:25])
        vis.pyplot.title('Sample training images.')
        vis.pyplot.figure()
        vis.show_multiple(testdata.raw_data()[:25])
        vis.pyplot.title('Sample testing images.')
        vis.pyplot.figure()
        vis.show_multiple(unlabeleddata.raw_data()[:25])
        vis.pyplot.title('Sample unlabeled images.')
        vis.pyplot.show()
    mpi.barrier()
Example #39
    def __init__(self, list_file, feat_range, posting_file, perc_pos,
                 keep_full_utt=False, posting_sampler=None, min_dur=0.2,
                 min_count=0.0, max_count=10000000.0, reader_type='utterance',
                 pickle_fname=None, list_file_sph=None, kw_feat=None,
                 merge_score_files=None):
        '''TODO: Read pieces of utterance from the CSV file instead to save
        memory. It would be nice to index these by utt_id (for now I do a
        map).'''
        super(BabelDataset, self).__init__()
        if list_file.find('eval') >= 0:
            self.is_eval = True
            self.T = FLAGS.T_eval
        else:
            self.is_eval = False
            self.T = FLAGS.T_train
        self.beta = FLAGS.beta
        self.reader_type = reader_type
        if reader_type=='lattice':
            self.is_lattice = True
            utt_reader = LatticeReader.LatticeReader(list_file)
            utt_reader.ReadAllLatices()
        elif reader_type=='utterance':
            self.is_lattice = False
            utt_reader = UtteranceReader.UtteranceReader(list_file,pickle_fname=pickle_fname)
            utt_reader.ReadAllUtterances(feat_range)
        elif reader_type=='snr':
            self.is_lattice = False
            utt_reader = SNRReader.SNRReader(list_file,pickle_fname=pickle_fname)
            utt_reader.ReadAllSNR()
        elif reader_type=='srate':
            self.is_lattice = False
            utt_reader = SrateReader.SrateReader(list_file,pickle_fname=pickle_fname)
            utt_reader.ReadAllSrate()
        elif reader_type=='score':
            self.is_lattice = False
            utt_reader = ScoreReader.ScoreReader(list_file,list_file_sph=list_file_sph,pickle_fname=pickle_fname, merge_score_files=merge_score_files)
        else:
            print 'Reader not implemented!'
            exit(0)
        if posting_sampler == None:
            testParser = PostingParser.PostingParser(posting_file)
            self.posting_sampler = Sampler.Sampler(testParser)
            self.posting_sampler.GetPositive()
            self.posting_sampler.GetNegative()
            self.posting_sampler.SampleData(perc_pos)
        else:
            self.posting_sampler = posting_sampler
        self.min_dur = min_dur
        self._data_all = None
        self._dim = False
        self._channels = 1
        self.keep_full_utt = keep_full_utt
        if mpi.is_root():
            self._data = []
            self._label = []
            self._features = []
            self._utt_id = []
            self._times = []
            self._keyword = []
            skipped = 0
            for i in range(len(self.posting_sampler.negative_data)):
                if utt_reader.map_utt_idx.has_key(self.posting_sampler.negative_data[i]['file']):
                    if self.posting_sampler.negative_data[i]['sys_bt'] == '':
                        print 'We found a negative example that was not produced by the system!'
                        exit(0)
                    sys_bt = float(self.posting_sampler.negative_data[i]['sys_bt'])
                    sys_et = float(self.posting_sampler.negative_data[i]['sys_et'])
                    sys_sc = float(self.posting_sampler.negative_data[i]['sys_score'])
                    if(sys_et-sys_bt < self.min_dur):
                        skipped += 1
                        continue
                    self._data.append(utt_reader.GetKeywordData(self.posting_sampler.negative_data[i]['file'],
                                                              sys_bt, sys_et,kw=self.posting_sampler.negative_data[i]['termid']))
                    self._label.append(0)
                    self._features.append(sys_sc)
                    self._utt_id.append(self.posting_sampler.negative_data[i]['file'])
                    self._times.append((sys_bt,sys_et))
                    self._keyword.append(self.posting_sampler.negative_data[i]['termid'])
                else:
                    pass
            for i in range(len(self.posting_sampler.positive_data)):
                if utt_reader.map_utt_idx.has_key(self.posting_sampler.positive_data[i]['file']):
                    if self.posting_sampler.positive_data[i]['sys_bt'] == '':
                        sys_bt = 0
                        sys_et = None
                        sys_sc = -1.0
                        #print self.posting_sampler.positive_data[i]['alignment']
                        continue #Should just ignore these?
                    else:
                        sys_bt = float(self.posting_sampler.positive_data[i]['sys_bt'])
                        sys_et = float(self.posting_sampler.positive_data[i]['sys_et'])
                        sys_sc = float(self.posting_sampler.positive_data[i]['sys_score'])
                        if(sys_et-sys_bt < self.min_dur):
                            skipped += 1
                            continue
                    self._data.append(utt_reader.GetKeywordData(self.posting_sampler.positive_data[i]['file'],
                                                              sys_bt, sys_et,kw=self.posting_sampler.positive_data[i]['termid']))
                    self._label.append(1)
                    self._features.append(sys_sc)
                    self._utt_id.append(self.posting_sampler.positive_data[i]['file'])
                    self._times.append((sys_bt,sys_et))
                    self._keyword.append(self.posting_sampler.positive_data[i]['termid'])
                else:
                    pass
            
            print 'I skipped %d entries out of %d' % (skipped,
                    len(self.posting_sampler.negative_data) +
                    len(self.posting_sampler.positive_data))
            
            self._label = np.array(self._label)
        else:
            self._data = None
            self._label = None
            self._features = None
            self._utt_id = None
            self._times = None
            self._keyword = None
        #populate true kw freq
        self._map_kw_counts = {}
        for i in range(len(self.posting_sampler.positive_data)):
            if utt_reader.map_utt_idx.has_key(self.posting_sampler.positive_data[i]['file']):
                kw = self.posting_sampler.positive_data[i]['termid']
                if self._map_kw_counts.has_key(kw):
                    self._map_kw_counts[kw] += 1
                else:
                    self._map_kw_counts[kw] = 1
        #filter dataset depending on count
        if mpi.is_root():
            ind_keep = []
            kw_zero = 0
            for i in range(len(self._keyword)):
                kw = self._keyword[i]
                kw_count = 0
                if self._map_kw_counts.has_key(kw):
                    kw_count = self._map_kw_counts[kw]
                else:
                    kw_zero += 1
                if kw_count <= max_count and kw_count >= min_count:
                    ind_keep.append(i)
            
            self._data = [self._data[i] for i in ind_keep]
            self._label = [self._label[i] for i in ind_keep]
            self._features = [self._features[i] for i in ind_keep]
            self._utt_id = [self._utt_id[i] for i in ind_keep]
            self._times = [self._times[i] for i in ind_keep]
            self._keyword = [self._keyword[i] for i in ind_keep]

                    
        self._data = mpi.distribute_list(self._data)
        self._label = mpi.distribute(self._label)
        self._features = mpi.distribute_list(self._features)
        self._utt_id = mpi.distribute_list(self._utt_id)
        self._times = mpi.distribute_list(self._times)
        self._keyword = mpi.distribute_list(self._keyword)
        if self.keep_full_utt == True:
            self.utt_reader = utt_reader
        if kw_feat != None:
            try:
                kw_feat.has_key('length')
                self.CopyKeywordMaps(kw_feat)
            except:
                self.LoadMappingHescii(FLAGS.hescii_file)
                self.ComputeKeywordMaps()
Example #40
 def testIsRoot(self):
     if mpi.RANK == 0:
         self.assertTrue(mpi.is_root())
     else:
         self.assertFalse(mpi.is_root())
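Nearly every snippet on this page builds on the root-compute-then-broadcast idiom this test exercises; as a minimal sketch (compute_something() is a hypothetical placeholder):

if mpi.is_root():
    result = compute_something()   # heavy or root-only work on rank 0
else:
    result = None
result = mpi.COMM.bcast(result)    # now every rank holds the same value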
std.resize(np.prod(regions_pooled.shape[1:-1]), regions_pooled.shape[-1])
std = std.mean(axis=0)
std_order = np.argsort(std)

# now, compute the within-class std
regions_pooled_view = regions_pooled.reshape(regions_pooled.shape[0],
        np.prod(regions_pooled.shape[1:-1]), regions_pooled.shape[-1])
within_std_local = regions_pooled_view.var(axis=1)
print within_std_local.shape
within_std = np.sqrt(mathutil.mpi_mean(within_std_local))
within_std_order = np.argsort(within_std)

std_comparison = within_std / (std + 1e-10)
std_comparison_order = np.argsort(std_comparison)

if mpi.is_root():
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_order])
    pyplot.savefig("codes_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[within_std_order])
    pyplot.savefig("codes_within_std_ordered.pdf")
    pyplot.figure()
    visualize.show_multiple(conv[-2].dictionary[std_comparison_order])
    pyplot.savefig("codes_std_comparison_ordered.pdf")
    pyplot.figure()
    pyplot.plot(std)
    pyplot.show()
mpi.barrier()

def apcluster_k(feature, num_centers, corr = True, tol = 0):
    """perform the affinity propagation algorithm for the input codes.
    """
    logging.debug("ap: preparing similarity matrix")
    covmat = mathutil.mpi_cov(feature)
    std = np.diag(covmat)
    # normalize
    std = np.sqrt(std**2 + 0.01)
    if corr:
        # compute correlation. If corr is False, we will use the covariance
        # directly
        covmat /= std
        covmat /= std[:, np.newaxis]
    # compute the similarity matrix
    norm = np.diag(covmat) / 2
    covmat -= norm
    covmat -= norm[:, np.newaxis]
    # add a small noise to covmat
    noise = (covmat + np.finfo(np.float64).eps) * \
            np.random.rand(covmat.shape[0], covmat.shape[1])
    mpi.COMM.Bcast(noise)
    covmat += noise
    # The remaining part can just be carried out on root
    if mpi.is_root():
        # set preference
        pmax = covmat.max()
        #af = AffinityPropagation().fit(covmat, pmax)
        #num_max = len(af.cluster_centers_indices_)
        # in fact, num_max would always be covmat.shape[0] so we don't really
        # run ap
        num_max = covmat.shape[0]
        logging.debug("ap: pmax = %s, num = %d" % (pmax, num_max))
        pmin = covmat.min()
        af = AffinityPropagation().fit(covmat, pmin)
        # num_min is the theoretical minimum, but the python code seems to have bugs...
        num_min = len(af.cluster_centers_indices_)
        logging.debug("ap: pmin = %s, num = %d" % (pmin, num_min))
        
        if num_centers < num_min:
            logging.warning("num_centers too small, will return %d centers" % (num_min,))
            return af.cluster_centers_indices_, af.labels_, covmat
    
        if num_centers > num_max:
            logging.warning("num_centers too large, will return everything.")
            return np.arange(covmat.shape[0], dtype=np.int), \
                   np.arange(covmat.shape[0], dtype=np.int), covmat
        
        logging.debug("ap: start affinity propagation")
        
        # We will simply use bisection search to find the right number of centroids.
        for i in range(_AP_MAX_ITERATION):
            pref = (pmax + pmin) / 2
            af = AffinityPropagation().fit(covmat, pref)
            num = len(af.cluster_centers_indices_)
            logging.debug("ap try %d: pref = %s, num = %s" % (i + 1, pref, num))
            if num >= num_centers - tol and num <= num_centers + tol:
                break
            elif num < num_centers:
                pmin = pref
                num_min = num
            else:
                pmax = pref
                num_max = num
    else:
        af = None
    mpi.barrier()
    af = mpi.COMM.bcast(af)
    return af.cluster_centers_indices_, af.labels_, covmat
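A hedged usage sketch for apcluster_k() above, pruning an encoder dictionary down to the selected exemplars the same way prune_conv() does elsewhere on this page; the feature matrix and the target of 256 centers are illustrative:

center_idx, labels, covmat = apcluster_k(features, 256, corr=True, tol=2)
conv[-2].dictionary = conv[-2].dictionary[center_idx]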
def cifar_demo():
    """Performs a demo classification on cifar
    """
    mpi.mkdir(FLAGS.output_dir)
    logging.info('Loading cifar data...')
    cifar = visiondata.CifarDataset(FLAGS.root, is_training=True)
    cifar_test = visiondata.CifarDataset(FLAGS.root, is_training=False)
    conv = pipeline.ConvLayer([
        pipeline.PatchExtractor([6, 6], 1),  # extracts patches
        pipeline.MeanvarNormalizer({'reg': 10}),  # normalizes the patches
        pipeline.LinearEncoder({}, trainer=pipeline.ZcaTrainer(
            {'reg': 0.1})),  # Does whitening
        pipeline.ThresholdEncoder({
            'alpha': 0.25,
            'twoside': True
        },
                                  trainer=pipeline.OMPTrainer({
                                      'k': 800,
                                      'max_iter': 100
                                  })),  # does encoding
        pipeline.SpatialPooler({
            'grid': (2, 2),
            'method': 'ave'
        })  # average pool
    ])
    logging.info('Training the pipeline...')
    conv.train(cifar, 50000)
    logging.info('Dumping the pipeline...')
    if mpi.is_root():
        with open(os.path.join(FLAGS.output_dir, FLAGS.model_file),
                  'w') as fid:
            pickle.dump(conv, fid)
            fid.close()
    with open(os.path.join(FLAGS.output_dir, FLAGS.model_file), 'r') as fid:
        conv = pickle.load(fid)
    logging.info('Extracting features...')
    Xtrain = conv.process_dataset(cifar, as_2d=True)
    mpi.dump_matrix_multi(
        Xtrain, os.path.join(FLAGS.output_dir, FLAGS.feature_file + '_train'))
    Ytrain = cifar.labels().astype(np.int)
    Xtest = conv.process_dataset(cifar_test, as_2d=True)
    mpi.dump_matrix_multi(
        Xtest, os.path.join(FLAGS.output_dir, FLAGS.feature_file + '_test'))
    Ytest = cifar_test.labels().astype(np.int)

    # normalization
    m, std = classifier.feature_meanstd(Xtrain)
    Xtrain -= m
    Xtrain /= std
    Xtest -= m
    Xtest /= std

    w, b = classifier.l2svm_onevsall(Xtrain, Ytrain, 0.01)
    if mpi.is_root():
        with open(os.path.join(FLAGS.output_dir, FLAGS.svm_file), 'w') as fid:
            pickle.dump({'m': m, 'std': std, 'w': w, 'b': b}, fid)
    accu = np.sum(Ytrain == (np.dot(Xtrain,w)+b).argmax(axis=1)) \
            / float(len(Ytrain))
    accu_test = np.sum(Ytest == (np.dot(Xtest,w)+b).argmax(axis=1)) \
            / float(len(Ytest))

    logging.info('Training accuracy: %f' % accu)
    logging.info('Testing accuracy: %f' % accu_test)
Beispiel #44
0
def __init__(self,
             root_folder,
             extensions,
             prefetch=False,
             target_size=None,
             max_size=None,
             min_size=None,
             center_crop=None):
    """Initialize from a two-layer storage.

    Input:
        root_folder: the root that contains the data. Under root_folder
            there should be a list of folders, under which there should be
            a list of files.
        extensions: the list of extensions used to filter the files,
            e.g. ['png', 'jpg']. Matching is case-insensitive.
        prefetch: if True, the images are prefetched to avoid disk reads.
            With a large number of images, prefetching requires a lot of
            memory.
        target_size, max_size, min_size, center_crop: see manipulate() for
            details.
    """
    super(TwoLayerDataset, self).__init__()
    if mpi.agree(not os.path.exists(root_folder)):
        raise OSError("The specified folder does not exist.")
    logging.debug('Loading from %s' % (root_folder, ))
    if isinstance(extensions, str):
        extensions = [extensions]
    extensions = set(extensions)
    if mpi.is_root():
        # get the candidate files first
        files = glob.glob(os.path.join(root_folder, '*', '*'))
        # keep only the files whose extension matches
        files = [
            f for f in files
            if any(f.lower().endswith(ext) for ext in extensions)
        ]
        logging.debug("A total of %d images." % (len(files)))
        # derive raw labels from the folder names
        labels = [os.path.split(os.path.split(f)[0])[1] for f in files]
        classnames = list(set(labels))
        # sort so we get a deterministic class order
        classnames.sort()
        name2val = dict(zip(classnames, range(len(classnames))))
        labels = [name2val[label] for label in labels]
    else:
        files = None
        classnames = None
        labels = None
    mpi.barrier()
    self._rawdata = mpi.distribute_list(files)
    self._data = self._rawdata
    self._prefetch = prefetch
    self._target_size = target_size
    self._max_size = max_size
    self._min_size = min_size
    self._center_crop = center_crop
    if target_size is not None:
        self._dim = tuple(target_size) + (3, )
    else:
        self._dim = False
    self._channels = 3
    if prefetch:
        self._data = [self._read(idx) for idx in range(len(self._data))]
    self._label = mpi.distribute_list(labels)
    self._classnames = mpi.COMM.bcast(classnames)
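
A minimal usage sketch (the path and extension list are illustrative): the
dataset expects one subfolder per class, e.g. root/cat/*.jpg and
root/dog/*.jpg, and shards the file list across MPI nodes.

dataset = TwoLayerDataset('/path/to/images',
                          ['jpg', 'png'],
                          prefetch=False,
                          target_size=[32, 32])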
Example #45
def apcluster_k(feature, num_centers, corr=True, tol=0):
    """perform the affinity propagation algorithm for the input codes.
    """
    logging.debug("ap: preparing similarity matrix")
    covmat = mathutil.mpi_cov(feature)
    std = np.diag(covmat)
    # regularized scale for each dimension, to avoid division by zero
    std = np.sqrt(std**2 + 0.01)
    if corr:
        # compute correlation. If corr is False, we will use the covariance
        # directly
        covmat /= std
        covmat /= std[:, np.newaxis]
    # compute the similarity matrix
    norm = np.diag(covmat) / 2
    covmat -= norm
    covmat -= norm[:, np.newaxis]
    # add a small amount of noise to covmat to break ties; broadcast it so
    # that every node perturbs the matrix identically
    noise = (covmat + np.finfo(np.float64).eps) * \
            np.random.rand(covmat.shape[0], covmat.shape[1])
    mpi.COMM.Bcast(noise)
    covmat += noise
    # The remaining part can just be carried out on root
    if mpi.is_root():
        # set preference
        pmax = covmat.max()
        #af = AffinityPropagation().fit(covmat, pmax)
        #num_max = len(af.cluster_centers_indices_)
        # in fact, num_max would always be covmat.shape[0] so we don't really
        # run ap
        num_max = covmat.shape[0]
        logging.debug("ap: pmax = %s, num = %d" % (pmax, num_max))
        pmin = covmat.min()
        af = AffinityPropagation().fit(covmat, pmin)
        # num_min is the theoretical min, but the python code seems to have bugs...
        num_min = len(af.cluster_centers_indices_)
        logging.debug("ap: pmin = %s, num = %d" % (pmin, num_min))

        # note: these two early returns happen on the root process only
        if num_centers < num_min:
            logging.warning("num_centers too small, will return %d centers" %
                            (num_min, ))
            return af.cluster_centers_indices_, af.labels_, covmat

        if num_centers > num_max:
            logging.warning("num_centers too large, will return everything.")
            return np.arange(covmat.shape[0], dtype=np.int), \
                   np.arange(covmat.shape[0], dtype=np.int), covmat

        logging.debug("ap: start affinity propagation")

        # We will simply use bisection search to find the right number of centroids.
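        # (the number of exemplars grows monotonically with the preference,
        # which is what makes bisection applicable here)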
        for i in range(_AP_MAX_ITERATION):
            pref = (pmax + pmin) / 2
            af = AffinityPropagation().fit(covmat, pref)
            num = len(af.cluster_centers_indices_)
            logging.debug("ap try %d: pref = %s, num = %s" %
                          (i + 1, pref, num))
            if num_centers - tol <= num <= num_centers + tol:
                break
            elif num < num_centers:
                pmin = pref
                num_min = num
            else:
                pmax = pref
                num_max = num
    else:
        af = None
    mpi.barrier()
    af = mpi.COMM.bcast(af)
    return af.cluster_centers_indices_, af.labels_, covmat
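
A hedged usage sketch (the feature matrix here is a synthetic stand-in; each
MPI node passes only its local slice, and the bisection inside apcluster_k
searches for a preference yielding roughly the requested number of centers):

local_features = np.random.randn(500, 256)  # stand-in for real features
centers_idx, labels, covmat = apcluster_k(local_features, 50, corr=True, tol=2)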
Example #46
def testIsRoot(self):
    if mpi.RANK == 0:
        self.assertTrue(mpi.is_root())
    else:
        self.assertFalse(mpi.is_root())
Example #47
    """ (commented-out experiment: project the features onto the top
    eigenvectors of the covariance matrix before training)
    eigval, eigvec = np.linalg.eigh(covmat)
    U = eigvec[:, -(200 * FLAGS.grid * FLAGS.grid):]
    #U = eigvec[:, -400:] * np.sqrt(eigval[-400:])
    Xtrain = np.dot(Xtrain, U)
    Xtest = np.dot(Xtest, U)
    """

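    # fminargs is presumably forwarded to the underlying scipy optimizer;
    # 'disp': 0 silences its output and 'maxfun' caps function evaluations.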
    w, b = classifier.l2svm_onevsall(Xtrain,
                                     Ytrain,
                                     0.002,
                                     fminargs={
                                         'disp': 0,
                                         'maxfun': 1000
                                     })
    accu_train = classifier.Evaluator.accuracy(Ytrain, np.dot(Xtrain, w) + b)
    accu_test = classifier.Evaluator.accuracy(Ytest, np.dot(Xtest, w) + b)
    logging.info('Training accuracy: %f' % accu_train)
    logging.info('Testing accuracy: %f' % accu_test)


if __name__ == "__main__":
    gflags.FLAGS(sys.argv)
    if mpi.is_root():
        logging.basicConfig(level=logging.DEBUG)
        if FLAGS.profile_file != "":
            cProfile.run('cifar_demo()', FLAGS.profile_file)
        else:
            cifar_demo()
    else:
        cifar_demo()
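
Note that every rank calls cifar_demo(): the pipeline training, feature
extraction and matrix dumps are MPI collectives, so the non-root processes
must enter the same call path; only the logging configuration and the
optional cProfile run are restricted to the root.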
Example #48
def kmeans(X, k, n_init=1, max_iter=300, tol=1e-4):
    """ K-means clustering algorithm.

    Parameters
    ----------
    X: ndarray
        An M by N array of M observations in N dimensions. On every MPI
        node, X holds the local data points that node is responsible for.

    k: int or ndarray
        The number of clusters to form.

    n_init: int, optional, default: 1
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result is the best output of the
        n_init consecutive runs in terms of inertia.

    max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    tol: float, optional
        Relative tolerance (scaled by the data variance) on the movement
        of the centers used to declare convergence.

    Returns
    -------
    centroid: ndarray
        A k by N array of centroids found at the last iteration of
        k-means.

    label: ndarray
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia: float
        The final value of the inertia criterion

    """
    # do k-means training
    # vdata helps the stop criterion
    vdata = mpi.COMM.allreduce(np.mean(np.var(X, 0))) / mpi.SIZE
    best_inertia = np.infty

    if k <= 0:
        raise ValueError("The number of centers (%d) should be positive." % k)
    if mpi.COMM.allreduce(X.shape[0], op=mpi.MPI.MIN) == 0:
        raise RuntimeError("Some nodes have zero data.")

    logging.debug("Kmeans: A total of %d data points." % \
                  mpi.COMM.allreduce(X.shape[0]))
    # pre-compute squared norms of data points
    x_squared_norms = (X**2).sum(axis=1)
    for init_count in range(n_init):
        logging.debug("Kmeans trial %d" % (init_count, ))
        # initialization
        centers = X[np.random.randint(X.shape[0], size=k)]
        centers_all = mpi.COMM.gather(centers)
        if mpi.is_root():
            centers_all = np.vstack(centers_all)
            centers[:] = centers_all[np.random.permutation(
                centers_all.shape[0])[:k]]
        mpi.COMM.Bcast(centers)

        # iterations
        for iter_id in range(max_iter):
            logging.debug("Kmeans iter %d" % (iter_id))
            centers_old = centers.copy()
            labels, inertia = _e_step(X,
                                      centers,
                                      x_squared_norms=x_squared_norms)
            inertia = mpi.COMM.allreduce(inertia)
            logging.debug("Inertia %f" % (inertia), )
            centers = _m_step(X, labels, k)
            # test convergence
            converged = (np.sum((centers_old - centers)**2) < tol * vdata)
            if mpi.agree(converged):
                break

        if inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
    return best_centers, best_labels, best_inertia
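
A minimal usage sketch (synthetic data, illustrative only). Under MPI each
rank passes its local shard of the observations; the broadcasts inside
kmeans() keep the returned centroids identical on every rank.

X_local = np.random.randn(1000, 16)
centers, labels, inertia = kmeans(X_local, k=8, n_init=3, max_iter=100)
logging.info("centroids: %s, inertia: %f" % (str(centers.shape), inertia))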