def test_preprocess():
    """
    Tests that `preprocess` fills in environment variables using various
    interfaces and raises a ValueError if a needed environment variable
    definition is missing.
    """
    try:
        keys = ["PYLEARN2_" + str(uuid.uuid1())[:8] for _ in xrange(3)]
        strs = ["${%s}" % k for k in keys]
        os.environ[keys[0]] = keys[1]
        # Test with os.environ only.
        assert preprocess(strs[0]) == keys[1]
        # Test with provided dict only.
        assert preprocess(strs[1], environ={keys[1]: keys[2]}) == keys[2]
        # Provided overrides os.environ.
        assert preprocess(strs[0], environ={keys[0]: keys[2]}) == keys[2]
        raised = False
        try:
            preprocess(strs[2], environ={keys[1]: keys[0]})
        except ValueError:
            raised = True
        assert raised

    finally:
        for key in keys:
            if key in os.environ:
                del os.environ[key]
Example #2
def get_key(config_file = '${HOME}/.key_chain'):
    """
    Read and return the auth key from the config file.
    """

    config_file = preprocess(config_file)
    config = ConfigParser.RawConfigParser()
    config.read(config_file)
    return config.get('mashape', 'key')
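
Presumably get_key() above expects the preprocessed ${HOME}/.key_chain file to contain a [mashape] section; a hedged sketch of the expected file contents and the call follows (the key value is a placeholder).

# ~/.key_chain (contents illustrative):
#
#     [mashape]
#     key = YOUR_API_KEY_HERE

api_key = get_key()   # expands ${HOME}, parses the file, returns the key string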
Example #3
    def __init__(self):
        default_path = "${PYLEARN2_DATA_PATH}"
        local_path = "${PYLEARN2_LOCAL_DATA_PATH}"
        self.pid = os.getpid()

        try:
            self.dataset_remote_dir = string_utils.preprocess(default_path)
            self.dataset_local_dir = string_utils.preprocess(local_path)
        except (ValueError, string_utils.NoDataPathError, string_utils.EnvironmentVariableError):
            # Local cache seems to be deactivated
            self.dataset_remote_dir = ""
            self.dataset_local_dir = ""
Example #4
    def __init__(self, whichset, path=None):
        
        # here, 'final' refers to the unlabeled images from which
        # we should make predictions (images_test_rev1);
        # the train/test/valid sets come from images_training_rev1;
        # 'bigtrain' is just the whole unsplit images_training_rev1
        assert whichset in ['train','test','valid','final','bigtrain']
        self.whichset = whichset
        # this is the final desired shape
        # the original shape is 424, 424
        self.img_shape = (100,100,3)
        self.target_shape = (37,)

        if path is None:
            path = '${PYLEARN2_DATA_PATH}/galaxy-data/'
        
        # load data
        path = preprocess(path)
        file_n = "{}_arrays.h5".format(os.path.join(path, "h5", whichset))
        if os.path.isfile(file_n):
            # just open file
            self.h5file = tables.openFile(file_n, mode='r')
        else:
            # create file and fill with data
            self.first_time(whichset, path, file_n)

        # ('b', 0, 1, 'c') = batch, image rows, image columns, channels
        axes = ('b', 0, 1, 'c')
        # view_converter = DefaultViewConverter((100, 100, 3), axes)
        # `root` is assumed here to be the root node of the h5 file opened above
        root = self.h5file.root
        super(galaxy_zoo_dataset, self).__init__(X=root.images, y=root.targets,
                                                 axes=axes)
Example #5
    def __init__(self,
                 path='../filtered-seizure-data', # base directory, location of directories of filtered hkl files
                 target='Dog_1', # target is added both to the path and as a prefix to each file name
                 one_hot=False,
                 scale_option='usf',
                 nwindows=60,
                 skip=5,
                 window_size=None,
                 expect_labels = True):
        """
        .. todo::

            WRITEME
        """
        self.path = path
        self.target = target
        self.one_hot = one_hot
        self.scale_option = scale_option
        self.nwindows = nwindows
        self.expect_labels = expect_labels
        self.skip = skip

        self.view_converter = None
        self.Nsamples = 239766 # 10 min at 399.61 Hz
        if window_size is None:
            self.window_size = self.Nsamples // self.nwindows
        else:
            self.window_size = window_size

        # and go

        self.path = preprocess(self.path)
        X, y = self._load_data()

        super(MyPyLearn2Dataset, self).__init__(X=X, y=y)
Example #6
def load_ndarray_label(name):
    """
    Load the train,valid,test label data for the dataset `name` and return it
    in ndarray format.  This is only available for the toy dataset ule.

    Parameters
    ----------
    name : 'ule'
        Must be 'ule'

    Returns
    -------
    train_l, valid_l, test_l : ndarray
        Label data loaded

    """
    assert name in ['ule']

    common_path = os.path.join(
        preprocess('${PYLEARN2_DATA_PATH}'), 'UTLC', 'filetensor', name + '_')
    trname, vname, tename = [common_path + subset + '.tf'
                             for subset in ['trainl', 'validl', 'testl']]

    trainl = load_filetensor(trname)
    validl = load_filetensor(vname)
    testl = load_filetensor(tename)
    return trainl, validl, testl
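
A hedged call sketch for the loader above, assuming the 'ule' filetensor files are present under ${PYLEARN2_DATA_PATH}/UTLC/filetensor.

train_l, valid_l, test_l = load_ndarray_label('ule')
print(train_l.shape)   # label arrays for the train/valid/test splits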
Example #7
def Transform():
    """Test smaller version of convolutional_network.ipynb"""
    which_experiment = "S100"
    skip.skip_if_no_data()
    yaml_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    data_dir = string_utils.preprocess("${PYLEARN2_DATA_PATH}")
    save_path = os.path.join(data_dir, "cifar10", "experiment_" + string.lower(which_experiment))
    base_save_path = os.path.join(data_dir, "cifar10")
    # Escape potential backslashes in Windows filenames, since
    # they will be processed when the YAML parser will read it
    # as a string
    # save_path.replace('\\', r'\\')

    yaml = open("{0}/experiment_base_transform.yaml".format(yaml_file_path), "r").read()
    hyper_params = {
        "batch_size": 64,
        "output_channels_h1": 64,
        "output_channels_h2": 128,
        "output_channels_h3": 600,
        "max_epochs": 100,
        "save_path": save_path,
        "base_save_path": base_save_path,
    }
    yaml = yaml % (hyper_params)
    train = yaml_parse.load(yaml)
    train.main_loop()
Example #8
def load(filepath, recurse_depth=0, retry=True):
    """
    Parameters
    ----------
    filepath : str
        A path to a file to load. Should be a pickle, Matlab, or NumPy
        file.
    recurse_depth : int
        End users should not use this argument. It is used by the function
        itself to implement the `retry` option recursively.
    retry : bool
        If True, will make a handful of attempts to load the file before
        giving up. This can be useful if you are for example calling
        show_weights.py on a file that is actively being written to by a
        training script--sometimes the load attempt might fail if the
        training script writes at the same time show_weights tries to
        read, but if you try again after a few seconds you should be able
        to open the file.

    Returns
    -------
    loaded_object : object
        The object that was stored in the file.

    .. todo::

        Refactor to hide recurse_depth from end users
    """
    try:
        import joblib

        joblib_available = True
    except ImportError:
        joblib_available = False
    if recurse_depth == 0:
        filepath = preprocess(filepath)

    if filepath.endswith(".npy") or filepath.endswith(".npz"):
        return np.load(filepath)

    if filepath.endswith(".mat"):
        global io
        if io is None:
            import scipy.io

            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError, nei:
            if str(nei).find("HDF reader") != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py

                    hdf_reader = h5py
                return hdf_reader.File(filepath)
            else:
                raise
        # this code should never be reached
        assert False
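
A hedged usage sketch for the load() above, which appears to be pylearn2.utils.serial.load; the pickle path is illustrative and assumes PYLEARN2_DATA_PATH is set in the environment.

from pylearn2.utils import serial

# The path is run through preprocess() on the first call, so ${...} expands.
model = serial.load('${PYLEARN2_DATA_PATH}/my_experiment/model.pkl')
print(type(model))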
Example #9
    def __init__(self, dataset, model, algorithm=None, save_path=None,
                 save_freq=0, extensions=None, allow_overwrite=True):
        """
        Construct a Train instance.

        Parameters
        ----------
        dataset : `pylearn2.datasets.dataset.Dataset`
        model : `pylearn2.models.model.Model`
        algorithm : <Optional>
        `pylearn2.training_algorithms.training_algorithm.TrainingAlgorithm`
        save_path : <Optional> str
            Path to save (with pickle / joblib) the model.
        save_freq : <Optional> int
            Frequency of saves, in epochs. A frequency of zero disables
            automatic saving altogether. A frequency of 1 saves every
            epoch. A frequency of 2 saves every other epoch, etc.
            (default=0, i.e. never save). Note: when automatic saving is
            enabled (e.g. save_freq > 0), the model is always saved after
            learning, even when the final epoch is not a multiple of
            `save_freq`.
        extensions : <Optional> iterable
            A collection of `TrainExtension` objects whose callbacks are
            triggered at various points in learning.
        allow_overwrite : <Optional> bool
            If `True`, will save the model to save_path even if there is already
            something there. Otherwise, will raise an error if the `save_path`
            is already occupied.
        """
        self.allow_overwrite = allow_overwrite
        self.first_save = True
        self.dataset = dataset
        self.model = model
        self.algorithm = algorithm
        if save_path is not None:
            if save_freq == 0:
                warnings.warn('save_path specified but save_freq is 0 '
                              '(never save). Is this intentional?')
            self.save_path = preprocess(save_path)
        else:
            if save_freq > 0:
                phase_variable = 'PYLEARN2_TRAIN_PHASE'
                if phase_variable in os.environ:
                    phase = 'phase%s' % os.environ[phase_variable]
                    tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                              phase, 'pkl']
                else:
                    tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
                self.save_path = '.'.join(tokens)
        self.save_freq = save_freq

        if hasattr(self.dataset, 'yaml_src'):
            self.model.dataset_yaml_src = self.dataset.yaml_src
        else:
            warnings.warn("dataset has no yaml src, model won't know what " +
                          "data it was trained on")

        self.extensions = extensions if extensions is not None else []
        self.training_seconds = sharedX(value=0, name='training_seconds_this_epoch')
        self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
Example #10
    def __init__(self, dataset, model, algorithm=None, save_path=None,
                 save_freq=0, extensions=None, allow_overwrite=True):
        self.allow_overwrite = allow_overwrite
        self.first_save = True
        self.dataset = dataset
        self.model = model
        self.algorithm = algorithm
        if save_path is not None:
            if save_freq == 0:
                warnings.warn('save_path specified but save_freq is 0 '
                              '(never save). Is this intentional?')
            self.save_path = preprocess(save_path)
        else:
            if save_freq > 0:
                phase_variable = 'PYLEARN2_TRAIN_PHASE'
                if phase_variable in os.environ:
                    phase = 'phase%s' % os.environ[phase_variable]
                    tokens = [os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'],
                              phase, 'pkl']
                else:
                    tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
                self.save_path = '.'.join(tokens)
        self.save_freq = save_freq

        if hasattr(self.dataset, 'yaml_src'):
            self.model.dataset_yaml_src = self.dataset.yaml_src
        else:
            warnings.warn("dataset has no yaml src, model won't know what " +
                          "data it was trained on")

        self.extensions = extensions if extensions is not None else []
        self.training_seconds = sharedX(value=0,
                                        name='training_seconds_this_epoch')
        self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
Example #11
    def __init__(self, path, n_labels=2, start=None, stop=None, del_raw=True, x_only=False):
        self.del_raw = del_raw
        path = preprocess(path)

        x, y = CSVDataset._load_data(path, del_raw=del_raw)
        if np.isnan(np.min(y)):
            y = None
        else:
            y = y.astype(int).reshape(-1, 1)

        if start is not None:
            if stop is None:
                stop = x.shape[0]
            assert start >= 0
            assert start < stop
            if not (stop <= x.shape[0]):
                raise ValueError("stop must be less than the # of examples but " +
                                 "stop is " + str(stop) + " and there are " + str(x.shape[0]) +
                                 " examples.")
            x = x[start:stop, :]
            if y is not None:
                y = y[start:stop, :]

        if x_only:
            y = None
            n_labels = None

        super(CSVDataset, self).__init__(X=x, y=y, y_labels=n_labels)
Example #12
    def _unpickle(cls, file):
        """
        .. todo::

            What is this? why not just use serial.load like the CIFAR-100
            class? Whoever wrote it shows up as "unknown" in git blame.
        """
        from pylearn2.utils import string_utils
        fname = os.path.join(string_utils.preprocess('${PYLEARN2_DATA_PATH}'),
                              'cifar10', 'cifar-10-batches-py', file)
        # fname = os.path.join('/Users/karino-t/data/cifar10/cifar-10-batches-py',file)
        if not os.path.exists(fname):
            raise IOError(fname+" was not found. You probably need to "
                          "download the CIFAR-10 dataset by using the "
                          "download script in "
                          "pylearn2/scripts/datasets/download_cifar10.sh "
                          "or manually from "
                          "http://www.cs.utoronto.ca/~kriz/cifar.html")
        fname = cache.datasetCache.cache_file(fname)

        _logger.info('loading file %s' % fname)
        fo = open(fname, 'rb')
        dict = cPickle.load(fo)
        fo.close()
        return dict
Example #13
    def __init__(self,
            path = 'train.csv',
            one_hot = False,
            expect_labels = True,
            expect_headers = True,
            delimiter = ',',
            col_number = 10):
        """
        .. todo::

            WRITEME
        """
        self.path = path
        self.one_hot = one_hot
        self.expect_labels = expect_labels
        self.expect_headers = expect_headers
        self.delimiter = delimiter
        self.col_number = col_number
        
        self.view_converter = None

        # and go

        self.path = preprocess(self.path)
        X, y = self._load_data()
        
        super(CSVModified, self).__init__(X=X, y=y)
Example #14
def load(filepath, recurse_depth=0):

    try:
        import joblib
        joblib_available = True
    except ImportError:
        joblib_available = False
    if recurse_depth == 0:
        filepath = preprocess(filepath)

    if filepath.endswith('.npy'):
        return np.load(filepath)

    if filepath.endswith('.mat'):
        global io
        if io is None:
            import scipy.io
            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError, nei:
            if str(nei).find('HDF reader') != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py
                    hdf_reader = h5py
                return hdf_reader.File(filepath)
            else:
                raise
        #this code should never be reached
        assert False
Example #15
def main():
    base = '${PYLEARN2_DATA_PATH}/esp_game/ESPGame100k/labels/'
    base = preprocess(base)
    paths = sorted(os.listdir(base))
    assert len(paths) == 100000

    words = {}

    for i, path in enumerate(paths):

        if i % 1000 == 0:
            print(i)
        path = base+path
        f = open(path, 'r')
        lines = f.readlines()
        for line in lines:
            word = line[: -1]
            if word not in words:
                words[word] = 1
            else:
                words[word] += 1

    ranked_words = sorted(words.keys(), key=lambda x: -words[x])

    ranked_words = [word_ + '\n' for word_ in ranked_words[0:4000]]

    f = open('wordlist.txt', 'w')
    f.writelines(ranked_words)
    f.close()
Example #16
    def __init__(self, save_dir):
        PYLEARN2_TRAIN_DIR = preprocess('${PYLEARN2_TRAIN_DIR}')
        PYLEARN2_TRAIN_BASE_NAME = preprocess('${PYLEARN2_TRAIN_BASE_NAME}')

        src = os.path.join(PYLEARN2_TRAIN_DIR, PYLEARN2_TRAIN_BASE_NAME)
        dst = os.path.join(save_dir, PYLEARN2_TRAIN_BASE_NAME)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        if os.path.exists(save_dir) and not os.path.isdir(save_dir):
            raise IOError("save path %s exists, not a directory" % save_dir)
        elif not os.access(save_dir, os.W_OK):
            raise IOError("permission error creating %s" % dst)

        with log_timing(log, 'copying yaml from {} to {}'.format(src, dst)):
            copyfile(src, dst)
Example #17
 def __enter__(self):
     if isinstance(self._f, basestring):
         self._f = preprocess(self._f)
         self._handle = open(self._f, self._mode, self._buffering)
     else:
         self._handle = self._f
     return self._handle
Example #18
def show(image):
    """
    Parameters
    ----------
    image : PIL Image object or ndarray
        If ndarray, integer formats are assumed to use 0-255
        and float formats are assumed to use 0-1
    """
    if hasattr(image, '__array__'):
        #do some shape checking because PIL just raises a tuple indexing error
        #that doesn't make it very clear what the problem is
        if len(image.shape) < 2 or len(image.shape) > 3:
            raise ValueError('image must have either 2 or 3 dimensions but its shape is '+str(image.shape))

        if image.dtype == 'int8':
            image = np.cast['uint8'](image)
        elif str(image.dtype).startswith('float'):
            #don't use *=, we don't want to modify the input array
            image = image * 255.
            image = np.cast['uint8'](image)

        #PIL is too stupid to handle single-channel arrays
        if len(image.shape) == 3 and image.shape[2] == 1:
            image = image[:,:,0]

        try:
            ensure_Image()
            image = Image.fromarray(image)
        except TypeError:
            raise TypeError("PIL issued TypeError on ndarray of shape " +
                            str(image.shape) + " and dtype " +
                            str(image.dtype))


    try:
        f = NamedTemporaryFile(mode='r', suffix='.png', delete=False)
    except TypeError:
        # before python2.7, we can't use the delete argument
        f = NamedTemporaryFile(mode='r', suffix='.png')
        """
        TODO: prior to python 2.7, NamedTemporaryFile has no delete = False
        argument unfortunately, that means f.close() deletes the file.  we then
        save an image to the file in the next line, so there's a race condition
        where for an instant we  don't actually have the file on the filesystem
        reserving the name, and then write to that name anyway

        TODO: see if this can be remedied with lower level calls (mkstemp)
        """
        warnings.warn('filesystem race condition')

    name = f.name
    f.flush()
    f.close()
    image.save(name)
    viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}')
    if os.name == 'nt':
        subprocess.Popen(viewer_command + ' ' + name +' && del ' + name, shell = True)
    else:
        subprocess.Popen(viewer_command + ' ' + name +' ; rm ' + name, shell = True)
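
A hedged sketch of calling the show() defined above, assuming numpy is installed and PYLEARN2_VIEWER_COMMAND points at an image viewer (e.g. eog or open); the random patch is purely illustrative.

import numpy as np

# Float images are assumed to lie in [0, 1].
patch = np.random.uniform(0., 1., size=(32, 32, 3))
show(patch)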
Example #19
    def __init__(self, which_set,
            base_path = '${PYLEARN2_DATA_PATH}/hoge',
            start = None,
            stop = None,
            preprocessor = None,
            fit_preprocessor = False,
            axes = ('b', 0, 1, 'c'),
            fit_test_preprocessor = False):
        """
        which_set: A string specifying which portion of the dataset
            to load. Valid values are 'train' or 'public_test'
        base_path: The directory containing the .csv files from kaggle.com.
                This directory should be writable; if the .csv files haven't
                already been converted to npy, this class will convert them
                to save memory the next time they are loaded.
        fit_preprocessor: True if the preprocessor is allowed to fit the
                   data.
        fit_test_preprocessor: If we construct a test set based on this
                    dataset, should it be allowed to fit the test set?
        """

        self.test_args = locals()
        self.test_args['which_set'] = 'public_test'
        self.test_args['fit_preprocessor'] = fit_test_preprocessor
        del self.test_args['start']
        del self.test_args['stop']
        del self.test_args['self']

        files = {'train': 'train.csv', 'public_test' : 'test.csv'}

        try:
            filename = files[which_set]
        except KeyError:
            raise ValueError("Unrecognized dataset name: " + which_set)

        path = base_path + '/' + filename

        path = preprocess(path)

        X, y = self._load_data(path, which_set == 'train')


        if start is not None:
            assert which_set != 'test'
            assert isinstance(start, int)
            assert isinstance(stop, int)
            assert start >= 0
            assert start < stop
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            if y is not None:
                y = y[start:stop, :]

        view_converter = DefaultViewConverter(shape=[48,48,1], axes=axes)

        super(HogeDataset, self).__init__(X=X, y=y, view_converter=view_converter)

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
Example #20
    def __init__(self,
                 which_set='full',
                 path='train.mat',
                 one_hot=False,
                 colorspace='none',
                 step=1,
                 start=None,
                 stop=None,
                 center=False,
                 rescale=False,
                 gcn=None,
                 toronto_prepro=False,
                 axes=('b', 0, 1, 'c')):

        self.__dict__.update(locals())
        del self.self

        # self.one_hot = one_hot
        # self.colorspace = colorspace
        # self.step = step
        # self.which_set = which_set

        self.view_converter = None

        self.path = preprocess(self.path)
        X, y = self._load_data()

        if center:
            X -= 127.5
        # self.center = center

        if rescale:
            X /= 127.5
        # self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = MATDATA(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        # self.toronto_prepro = toronto_prepro

        # self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn, min_divisor=1e-8)

        view_converter = DefaultViewConverter(
            (self.windowSize, self.windowSize, self.channels), axes)

        super(MATDATA, self).__init__(X=X, y=y, view_converter=view_converter)
Example #21
    def _unpickle(cls, file):
        from pylearn2.utils import string_utils

        fname = os.path.join(string_utils.preprocess("${PYLEARN2_DATA_PATH}"), "cifar10", "cifar-10-batches-py", file)
        _logger.info("loading file %s" % fname)
        fo = open(fname, "rb")
        dict = cPickle.load(fo)
        fo.close()
        return dict
Example #22
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL10-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))
    del supplement

    print("Preparing output directory...")
    patch_dir = data_dir + '/stl10_patches_8x8'
    serial.mkdir(patch_dir)
    README = open(patch_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    data.pkl contains a pylearn2 Dataset object defining an unlabeled
    dataset of 2 million 8x8 approximately whitened, contrast-normalized
    patches drawn uniformly at random from a downsampled (to 32x32)
    version of the STL-10 train and unlabeled datasets.

    preprocessor.pkl contains a pylearn2 Pipeline object that was used
    to extract the patches and approximately whiten / contrast normalize
    them. This object is necessary when extracting features for
    supervised learning or test set classification, because the
    extracted features must be computed using inputs that have been
    whitened with the ZCA matrix learned and stored by this Pipeline.

    They were created with the pylearn2 script make_stl10_patches.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Preprocessing the data...")
    pipeline = preprocessing.Pipeline()
    pipeline.items.append(preprocessing.ExtractPatches(patch_shape=(8, 8),
                          num_patches=2*1000*1000))
    pipeline.items.append(
        preprocessing.GlobalContrastNormalization(sqrt_bias=10., use_std=True))
    pipeline.items.append(preprocessing.ZCA())
    data.apply_preprocessor(preprocessor=pipeline, can_fit=True)

    data.use_design_loc(patch_dir + '/data.npy')

    serial.save(patch_dir + '/data.pkl', data)

    serial.save(patch_dir + '/preprocessor.pkl', pipeline)
Example #23
    def __init__(self, which_set, stop=None):
        assert which_set in ['train', 'valid']
        self._stop = stop
        # TextDatasetMixin parameters
        self._unknown_index = 0
        self._end_of_word_index = 100
        self._case_sensitive = True
        with open(preprocess('${PYLEARN2_DATA_PATH}/word2vec/'
                             'char_vocab.pkl')) as f:
            self._vocabulary = cPickle.load(f)

        # Load the data
        with tables.open_file(preprocess('${PYLEARN2_DATA_PATH}/word2vec/'
                                         'characters.h5')) as f:
            node = f.get_node('/characters_%s' % which_set)
            # VLArray is strange, and this seems faster than reading node[:]
            if self._stop is not None:
                self.X = np.asarray([char_sequence[:, np.newaxis]
                                     for char_sequence in node[:self._stop]])
            else:
                self.X = np.asarray([char_sequence[:, np.newaxis]
                                     for char_sequence in node])
            # Format is [batch, time, data]

        with tables.open_file(preprocess('${PYLEARN2_DATA_PATH}/word2vec/'
                                         'embeddings.h5')) as f:
            node = f.get_node('/embeddings_%s' % which_set)
            if self._stop is not None:
                self.y = node[:self._stop]
            else:
                self.y = node[:]

        with open(preprocess('/data/lisatmp3/devincol/normalization.pkl')) as f:
            (means, stds) = cPickle.load(f)

        print "normalizing targets"
        self.y = (self.y - means)/stds

        source = ('features', 'targets')
        space = CompositeSpace([SequenceDataSpace(IndexSpace(dim=1,
                                                             max_labels=101)),
                                VectorSpace(dim=300)])
        super(Word2Vec, self).__init__(data=(self.X, self.y),
                                       data_specs=(space, source))
Example #24
	def __init__(self, 
			path = 'train',
			column = None,
			one_hot = False,
			with_labels = True,
			start = None,
			stop = None,
			preprocessor = None,
			fit_preprocessor = False,
			fit_test_preprocessor = False):
		"""
		which_set: A string specifying which portion of the dataset
			to load. Valid values are 'train' or 'public_test'
		base_path: The directory containing the .csv files from kaggle.com.
				This directory should be writable; if the .csv files haven't
				already been converted to npy, this class will convert them
				to save memory the next time they are loaded.
		fit_preprocessor: True if the preprocessor is allowed to fit the
				   data.
		fit_test_preprocessor: If we construct a test set based on this
					dataset, should it be allowed to fit the test set?
		"""

		# self._iter_targets = True	# whatever that means / won't work

		self.no_classes = 2

		# won't work TODO
		self.test_args = locals()
		self.test_args['which_set'] = 'test'
		self.test_args['fit_preprocessor'] = fit_test_preprocessor
		del self.test_args['start']
		del self.test_args['stop']
		del self.test_args['self']
		
		path = preprocess(path)
		X, y = self._load_data( path, column, with_labels )


		if start is not None:
			assert which_set != 'test'
			assert isinstance(start, int)
			assert isinstance(stop, int)
			assert start >= 0
			assert start < stop
			assert stop <= X.shape[0]
			X = X[start:stop, :]
			if y is not None:
				y = y[start:stop, :]


		super(TestDataset, self).__init__(X=X, y=y)

		if preprocessor:
			preprocessor.apply(self, can_fit=fit_preprocessor)
Example #25
def main():
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

    print('Loading CIFAR-100 train dataset...')
    train = CIFAR100(which_set='train', gcn=55.)

    print("Preparing output directory...")
    output_dir = data_dir + '/pylearn2_gcn_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    train.pkl, and test.pkl each contain
    a pylearn2 Dataset object defining a labeled
    dataset of a 32x32 contrast normalized,
    approximately whitened version of the CIFAR-100 dataset.
    train.pkl contains labeled train examples.
    test.pkl contains labeled test examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_cifar100_gcn_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor \
           and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the training data')
    train.use_design_loc(output_dir+'/train.npy')
    serial.save(output_dir + '/train.pkl', train)

    print("Loading the test data")
    test = CIFAR100(which_set='test', gcn=55.)

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir+'/test.npy')
    serial.save(output_dir+'/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
Example #26
    def __init__(self, start=None, stop=None, axes=("b", 0, 1, "c"), stdev=0.8, hack=None, preproc="GCN"):
        #       self.translation_dict = OrderedDict({1: 'left_eyebrow_inner_end', 2: 'mouth_top_lip_bottom', 3: 'right_ear_canal', 4: 'right_ear_top', 5: 'mouth_top_lip', 6: 'mouth_bottom_lip_top', 7: 'right_eyebrow_center', 8: 'chin_left', 9: 'nose_tip', 10: 'left_eyebrow_center_top', 11: 'left_eye_outer_corner', 12: 'right_ear', 13: 'mouth_bottom_lip', 14: 'left_eye_center', 15: 'left_mouth_outer_corner', 16: 'left_eye_center_top', 17: 'left_ear_center', 18: 'nostrils_center', 19: 'right_eye_outer_corner', 20: 'right_eye_center_bottom', 21: 'chin_center', 22: 'left_eye_inner_corner', 23: 'right_mouth_outer_corner', 24: 'left_ear_bottom', 25: 'right_eye_center_top', 26: 'right_eyebrow_inner_end', 27: 'left_eyebrow_outer_end', 28: 'left_ear_top', 29: 'right_ear_center', 30: 'nose_center_top', 31: 'face_center', 32: 'right_eye_inner_corner', 33: 'right_eyebrow_center_top', 34: 'left_eyebrow_center', 35: 'right_eye_pupil', 36: 'right_ear_bottom', 37: 'mouth_left_corner', 38: 'left_eye_center_bottom', 39: 'left_eyebrow_center_bottom', 41: 'mouth_right_corner', 42: 'right_nostril', 43: 'right_eye_center', 44: 'chin_right', 45: 'right_eyebrow_outer_end', 46: 'left_eye_pupil', 47: 'mouth_center', 48: 'left_nostril', 49: 'right_eyebrow_center_bottom', 50: 'left_ear_canal', 51: 'left_ear', 52: 'face_right', 53: 'face_left'})

        self.name = hack
        self.stdev = stdev
        self.axes = axes
        self.pixels = numpy.arange(0, 96).reshape((1, 96))
        for i in xrange(len(keypoints_names) * 2 - 1):
            self.pixels = numpy.vstack((self.pixels, numpy.arange(0, 96).reshape((1, 96))))

        # self.which_set = which_set
        if hack is not None:
            X = LazyMemmap(
                preprocess("/Tmp/aggarwal/EmotiW_" + preproc + "_" + hack + ".npy"), dtype="float32", mode="c"
            )
        else:
            X = LazyMemmap(preprocess("${PYLEARN2_DATA_PATH}/faces/hdf5/complete_train_x.npy"), dtype="uint8", mode="c")

        Y = LazyMemmap(
            preprocess("${PYLEARN2_DATA_PATH}/faces/hdf5/complete_train_y.npy"), dtype=numpy.float32, mode="c"
        )

        num_examples = len(X) / (96.0 * 96.0 * 3.0)

        if stop is None:
            stop = num_examples
        if start is None:
            start = 0

        X = X.view()[start * 96 * 96 * 3 : stop * 96 * 96 * 3]
        Y = Y.view()[start * len(keypoints_names) * 2 : stop * len(keypoints_names) * 2]
        X.shape = (stop - start, 96 * 96 * 3)
        # print 'shape of X', X.mean(axis = 1).shape
        Y.shape = (stop - start, len(keypoints_names) * 2)
        if hack is not None:
            Y = self.make_targets(Y, hack)
        else:
            Y = self.make_targets(Y, "all")

        super(EmotiwKeypoints, self).__init__(
            X=X, y=Y, view_converter=DefaultViewConverter(shape=[96, 96, 3], axes=axes)
        )
Example #27
 def __init__(self, which_set, data_path=None, 
              term_range=None, target_type='cluster100'):
     """
     which_set: a string specifying which portion of the dataset
         to load. Valid values are 'train', 'valid' or 'test'
     data_path: a string specifying the directory containing the 
         webcluster data. If None (default), use environment 
         variable WEBCLUSTER_DATA_PATH.
     term_range: a tuple for taking only a slice of the available
         terms. Default is to use all 6275. For example, an input
         range of (10,2000) will truncate the 10 most frequent terms
         and the 6275-2000=4275 least frequent terms, where by frequency
         we mean how many unique documents each term is in.
     target_type: the type of targets to use. Valid options are 
         'cluster[10,100,1000]'
     """
     self.__dict__.update(locals())
     del self.self
     
     self.corpus_terms = None
     self.doc_info = None
     
     print "loading WebCluster DDM. which_set =", self.which_set
     
     if self.data_path is None:
         self.data_path \
             = string_utils.preprocess('${WEBCLUSTER_DATA_PATH}')
     
     fname = os.path.join(self.data_path, which_set+'_doc_inputs.npy')
     X = np.load(fname)
     if self.term_range is not None:
         X = X[:,self.term_range[0]:self.term_range[1]]
         X = X/X.sum(1).reshape(X.shape[0],1)
     print X.sum(1).mean()
     
     fname = os.path.join(self.data_path, which_set+'_doc_targets.npy')
     # columns: 0:cluster10s, 1:cluster100s, 2:cluster1000s
     self.cluster_hierarchy = np.load(fname)
     
     y = None
     if self.target_type == 'cluster10':
         y = self.cluster_hierarchy[:,0]
     elif self.target_type == 'cluster100':
         y = self.cluster_hierarchy[:,1]
     elif self.target_type == 'cluster1000':
         y = self.cluster_hierarchy[:,2]
     elif self.target_type is None:
         pass
     else:
         raise NotImplementedError()
     
     DenseDesignMatrix.__init__(self, X=X, y=y)
     
     print "... WebCluster ddm loaded"
Example #28
            def get_relative_path(full_path):
                """
                Returns the relative path to the PYLEARN2_DATA_PATH.
                """
                data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')

                if not memmap.filename.startswith(data_dir):
                    raise ValueError("Expected memmap.filename to start with "
                                     "the PYLEARN2_DATA_PATH (%s). Instead it "
                                     "was %s." % (data_dir, memmap.filename))

                return os.path.relpath(full_path, data_dir)
Example #29
 def _unpickle(cls, file):
     from pylearn2.utils import string_utils
     fname = os.path.join(
             string_utils.preprocess('${PYLEARN2_DATA_PATH}'),
             'cifar10',
             'cifar-10-batches-py',
             file)
     _logger.info('loading file %s' % fname)
     fo = open(fname, 'rb')
     dict = cPickle.load(fo)
     fo.close()
     return dict
Example #30
def _getVar(key, environ=None):
    """
    Looks for a key in custom and os environments.

    Parameters
    ----------
    key : str
        The key to look for.
    environ : dict, optional
        A custom dictionary to search before the system environment.

    Returns
    -------
        None if the key was not found, a string otherwise.
    """
    if environ:
        if environ.has_key(key):
            return string_utils.preprocess(environ[key], environ=environ)
    if os.environ.has_key(key):
        return string_utils.preprocess(os.environ[key])
    return None
Example #31
def save(filepath, obj, on_overwrite = 'ignore'):
    """
    Serialize `object` to a file denoted by `filepath`.

    Parameters
    ----------
    filepath : str
        A filename. If the suffix is `.joblib` and joblib can be
        imported, `joblib.dump` is used in place of the regular
        pickling mechanisms; this results in much faster saves by
        saving arrays as separate .npy files on disk. If the file
        suffix is `.npy` then `numpy.save` is attempted on `obj`.
        Otherwise, (c)pickle is used.

    obj : object
        A Python object to be serialized.

    on_overwrite : str, optional
        A string specifying what to do if the file already exists.
        Possible values include:

        - "ignore" : Just overwrite the existing file.
        - "backup" : Make a backup copy of the file (<filepath>.bak).
          Save the new copy. Then delete the backup copy. This allows
          recovery of the old version of the file if saving the new one
          fails.
    """
    filepath = preprocess(filepath)

    if os.path.exists(filepath):
        if on_overwrite == 'backup':
            backup = filepath + '.bak'
            shutil.move(filepath, backup)
            save(filepath, obj)
            try:
                os.remove(backup)
            except Exception, e:
                warnings.warn("Got an error while traing to remove "+backup+":"+str(e))
            return
        else:
            assert on_overwrite == 'ignore'
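
A hedged usage sketch for the save() above, which appears to be pylearn2.utils.serial.save; the environment variable, path, and saved object are illustrative.

import os
from pylearn2.utils import serial

os.environ.setdefault('MY_SAVE_DIR', '/tmp')   # hypothetical variable
stats = {'epoch': 3, 'nll': 0.42}              # illustrative object
# The path is preprocess()ed, so ${MY_SAVE_DIR} expands first; 'backup'
# keeps a .bak copy around while the new file is being written.
serial.save('${MY_SAVE_DIR}/stats.pkl', stats, on_overwrite='backup')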
Example #32
def load(stream, overrides=None, **kwargs):
    """
    Loads a YAML configuration from a string or file-like object.

    Parameters
    ----------
    stream : str or object
        Either a string containing valid YAML or a file-like object
        supporting the .read() interface.
    overrides : dict, optional
        A dictionary containing overrides to apply. The location of
        the override is specified in the key as a dot-delimited path
        to the desired parameter, e.g. "model.corruptor.corruption_level".

    Returns
    -------
    graph : dict or object
        The dictionary or object (if the top-level element specified a
        Python object to instantiate).

    Notes
    -----
    Other keyword arguments are passed on to `yaml.load`.
    """
    global is_initialized
    if not is_initialized:
        initialize()

    if isinstance(stream, basestring):
        string = stream
    else:
        string = '\n'.join(stream.readlines())

    processed_string = preprocess(string)

    proxy_graph = yaml.load(processed_string, **kwargs)

    #import pdb; pdb.set_trace()
    if overrides is not None:
        handle_overrides(proxy_graph, overrides)
    return instantiate_all(proxy_graph)
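
Per the body above, the whole YAML source is passed through preprocess() before parsing, so ${VAR} references expand inside the document. A hedged sketch follows; the environment variable is made up.

import os
from pylearn2.config import yaml_parse

os.environ['MY_YAML_DIR'] = '/tmp/experiments'   # hypothetical variable
graph = yaml_parse.load("{'save_path': '${MY_YAML_DIR}/run1.pkl'}")
print(graph['save_path'])                        # -> /tmp/experiments/run1.pkl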
Example #33
def _instantiate(proxy, bindings=None):
    """
    Instantiate a (hierarchy of) Proxy object(s).

    Parameters
    ----------
    proxy : object
        A `Proxy` object or list/dict/literal. Strings are run through
        `preprocess`.
    bindings : dict, optional
        A dictionary mapping previously instantiated `Proxy` objects
        to their instantiated values.

    Returns
    -------
    obj : object
        The result object from recursively instantiating the object DAG.

    Notes
    -----
    This should not be considered part of the stable, public API.
    """
    if bindings is None:
        bindings = {}
    if isinstance(proxy, Proxy):
        return _instantiate_proxy_tuple(proxy, bindings)
    elif isinstance(proxy, dict):
        # Recurse on the keys too, for backward compatibility.
        # Is the key instantiation feature ever actually used, by anyone?
        return dict((_instantiate(k, bindings), _instantiate(v, bindings))
                    for k, v in proxy.iteritems())
    elif isinstance(proxy, list):
        return [_instantiate(v, bindings) for v in proxy]
    # In the future it might be good to consider a dict argument that provides
    # a type->callable mapping for arbitrary transformations like this.
    elif isinstance(proxy, basestring):
        return preprocess(proxy)
    else:
        return proxy
Example #34
    def _unpickle(cls, file):
        """
        .. todo::

            What is this? why not just use serial.load like the CIFAR-100
            class? Whoever wrote it shows up as "unknown" in git blame.
        """
        from pylearn2.utils import string_utils
        fname = os.path.join(
                string_utils.preprocess('${PYLEARN2_DATA_PATH}'),
                'cifar10',
                'cifar-10-batches-py',
                file)
        if not os.path.exists(fname):
            raise IOError(fname+" was not found. You probably need to download "
                    "the CIFAR-10 dataset by using the download script in pylearn2/scripts/download_cifar10.sh "
                    "or manually from http://www.cs.utoronto.ca/~kriz/cifar.html")
        _logger.info('loading file %s' % fname)
        fo = open(fname, 'rb')
        dict = cPickle.load(fo)
        fo.close()
        return dict
Example #35
    def __init__(self, npy_filename, which_set, split):
        assert which_set in ['train', 'valid', 'test']

        self.split = split

        # Load data from .npy file
        npy_filename_root = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'),
                                         'icml07data', 'npy', npy_filename)

        x_file = npy_filename_root + '_inputs.npy'
        y_file = npy_filename_root + '_labels.npy'
        x_file = datasetCache.cache_file(x_file)
        y_file = datasetCache.cache_file(y_file)
        data_x = np.load(x_file, mmap_mode='r')
        data_y = np.load(y_file, mmap_mode='r')

        # some sanity checks
        assert np.isfinite(data_x).all()
        assert np.isfinite(data_y).all()
        assert data_x.shape[0] == data_y.shape[0]

        # extract
        n_train, n_valid, n_test = split
        sets = {
            'train': (0, n_train),
            'valid': (n_train, n_train + n_valid),
            'test': (n_train + n_valid, n_train + n_valid + n_test)
        }
        start, end = sets[which_set]

        data_x = data_x[start:end]
        data_y = data_y[start:end]

        view_converter = DefaultViewConverter((28, 28, 1))
        super(ICML07DataSet, self).__init__(X=data_x,
                                            y=data_y,
                                            y_labels=data_y.max() + 1,
                                            view_converter=view_converter)
Example #36
    def __init__(self, which_set, center=False, scale=False,
                 start=None, stop=None, axes=('b', 0, 1, 'c'),
                 preprocessor = None):

        assert which_set in self.mapper.keys()

        self.__dict__.update(locals())
        del self.self

        path = '${PYLEARN2_DATA_PATH}/SVHN/format2/'

        # load data
        path = preprocess(path)
        data_x, data_y = self.make_data(which_set, path)

        # rescale or center if permitted
        if center and scale:
            data_x -= 127.5
            data_x /= 127.5
        elif center:
            data_x -= 127.5
        elif scale:
            data_x /= 255.

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)
        super(SVHN_On_Memory, self).__init__(X=data_x, y=data_y,
                                             view_converter=view_converter)

        if preprocessor:
            if which_set in ['train', 'train_all', 'splitted_train']:
                can_fit = True
            else:
                can_fit = False
            preprocessor.apply(self, can_fit)

        del data_x, data_y
        gc.collect()
Example #37
 def __init__(self, which_set, standardize_quantitative=True,
              separate_types=False, prefix=None, one_hot=False):
     if separate_types:
         raise NotImplementedError("This won't work as long as this "
                                   "is a subset of DenseDesignMatrix")
     self._separate_types = separate_types
     self._standardize_quantitative = standardize_quantitative
     self._prefix = prefix
     self._one_hot = one_hot
     prefix = prefix if prefix is not None else "${PYLEARN2_DATA_PATH}"
     self._raw = load_covertype(
         preprocess(os.path.join(prefix, "covertype")),
         which_sets=which_set,
         separate_types=self._separate_types,
         standardize_quantitative=self._standardize_quantitative
     )
     labels = self._raw[which_set]['labels'] - 1  # 0 - 6, not 1 - 7
     if one_hot:
         labels = one_hot(labels, max_label=6)
     super(CoverType, self).__init__(
         X=self._raw[which_set]['features'],
         y=labels
     )
Example #38
def load_ndarray_label(name):
    """ Load the train,valid,test label data for the dataset `name` and return it in ndarray format.
        This is only available for the toy dataset ule.

    Parameters
    ----------
    name : 'ule'
        Must be 'ule'

    Returns
    -------
    train_l, valid_l, test_l : ndarray
        Label data loaded
    """
    assert name in ['ule']

    common_path = os.path.join(preprocess('${PYLEARN2_DATA_PATH}'), 'UTLC', 'filetensor', name+'_')
    trname,vname,tename = [common_path+subset+'.tf' for subset in ['trainl','validl','testl']]

    trainl = load_filetensor(trname)
    validl = load_filetensor(vname)
    testl = load_filetensor(tename)
    return trainl, validl, testl
Example #39
    def __init__(self,
                 path='train.mat',
                 start=None,
                 stop=None,
                 center=False,
                 rescale=False,
                 axes=('b', 0, 1, 'c'),
                 channels=4):

        self.__dict__.update(locals())
        del self.self

        self.filters = tables.Filters(complib='blosc', complevel=5)

        self.view_converter = None

        self.path = preprocess(self.path)

        X, y = self._load_data()

        self.windowSize = np.uint8(np.sqrt(X.shape[1] / 4))

        if center and rescale:
            X[:] -= 127.5
            X[:] /= 127.5
        elif center:
            X[:] -= 127.5
        elif rescale:
            X[:] /= 255.

        view_converter = DefaultViewConverter((61, 61, 4), axes)

        super(MATDATAPyTables, self).__init__(X=X,
                                              y=y,
                                              view_converter=view_converter)

        self.h5file.flush()
Example #40
    def __init__(self,
                 path,
                 expect_headers=False,
                 delimiter=",",
                 which_set="train"):
        """
    @param path: path of a data, should be a pkl file (str)
    @param expect_headers: if there is a header on the first row (bool)
    @param delimiter: delimiter of the data (str)
    @param which_set: specify which set is using (total, train, valid, test)
    """

        self.path = path
        self.delimiter = delimiter
        self.expect_headers = expect_headers
        self.which_set = which_set

        self.path = preprocess(self.path)

        X, y = self._load_data()

        start = 0
        end = X.shape[0]
        if self.which_set == "train":
            start = 0
            end *= 0.6
        elif self.which_set == "valid":
            start = end * 0.6
            end *= 0.8
        elif self.which_set == "test":
            start = end * 0.8

        X = X[start:end, :]
        y = y[start:end, :]

        super(IFDataset, self).__init__(X=X, y=y)
Example #41
def Transform():
    """Test smaller version of convolutional_network.ipynb"""
    which_experiment = 'S100'
    skip.skip_if_no_data()
    yaml_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}')
    save_path = os.path.join(data_dir, 'cifar10', 'experiment_'+string.lower(which_experiment))
    base_save_path = os.path.join(data_dir, 'cifar10')
    # Escape potential backslashes in Windows filenames, since
    # they will be processed when the YAML parser will read it
    # as a string
    #save_path.replace('\\', r'\\')

    yaml = open("{0}/experiment_base_transform.yaml".format(yaml_file_path), 'r').read()
    hyper_params = {'batch_size': 64,
                    'output_channels_h1': 64,
                    'output_channels_h2': 128,
                    'output_channels_h3': 600,
                    'max_epochs': 100,
                    'save_path': save_path,
                    'base_save_path' : base_save_path }
    yaml = yaml % (hyper_params)
    train = yaml_parse.load(yaml)
    train.main_loop()
Example #42
    def __init__(self,
                 which_set,
                 base_path='${PYLEARN2_DATA_PATH}/icml_2013_black_box',
                 start=None,
                 stop=None,
                 preprocessor=None,
                 fit_preprocessor=False,
                 fit_test_preprocessor=False):
        """
        which_set: A string specifying which portion of the dataset
            to load. Valid values are 'train' or 'public_test'
        base_path: The directory containing the .csv files from kaggle.com.
                This directory should be writable; if the .csv files haven't
                already been converted to npy, this class will convert them
                to save memory the next time they are loaded.
        fit_preprocessor: True if the preprocessor is allowed to fit the
                   data.
        fit_test_preprocessor: If we construct a test set based on this
                    dataset, should it be allowed to fit the test set?
        """

        self.test_args = locals()
        self.test_args['which_set'] = 'public_test'
        self.test_args['fit_preprocessor'] = fit_test_preprocessor
        del self.test_args['start']
        del self.test_args['stop']
        del self.test_args['self']

        files = {'train': 'train.csv', 'public_test': 'test.csv'}
        sizes = {'train': 1000, 'public_test': 10000, 'extra': 135735}

        if which_set == 'extra':
            path = base_path + '/' + 'extra_unsupervised_data.npy'
            X = serial.load(path).T
            y = None
        else:
            try:
                filename = files[which_set]
            except KeyError:
                raise ValueError("Unrecognized dataset name: " + which_set)

            path = base_path + '/' + filename

            path = preprocess(path)

            expect_labels = which_set == 'train'

            X, y = self._load_data(path, expect_labels)
        size = sizes[which_set]
        if X.shape[0] != size:
            raise ValueError("Expected " + str(size) + " examples, got " +
                             str(X.shape[0]))

        if start is not None:
            assert which_set != 'test'
            assert isinstance(start, int)
            assert isinstance(stop, int)
            assert start >= 0
            assert start < stop
            if not (stop <= X.shape[0]):
                raise ValueError(
                    "stop must be less than the # of examples but " +
                    "stop is " + str(stop) + " and there are " +
                    str(X.shape[0]) + " examples.")
            X = X[start:stop, :]
            if y is not None:
                y = y[start:stop, :]

        super(BlackBoxDataset, self).__init__(X=X, y=y, y_labels=9)

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
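
A hedged construction sketch for the BlackBoxDataset above, assuming the ICML 2013 black box .csv files sit under ${PYLEARN2_DATA_PATH}/icml_2013_black_box; the split sizes are illustrative (the 'train' set has 1000 examples).

train = BlackBoxDataset(which_set='train', start=0, stop=900)
valid = BlackBoxDataset(which_set='train', start=900, stop=1000)
print(train.X.shape)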
Example #43
    def __init__(self, which_set, path = None, center = False, scale = False,
            start = None, stop = None, axes = ('b', 0, 1, 'c'),
            preprocessor = None):
        """
        Only for faster access there is a copy of the hdf5 file in
        PYLEARN2_DATA_PATH, but it is meant to be read-only.
        If you wish to modify the data, you should pass a local copy
        to the path argument.
        """

        assert which_set in self.mapper.keys()

        self.__dict__.update(locals())
        del self.self

        if path is None:
            path = '${PYLEARN2_DATA_PATH}/SVHN/format2/'
            mode = 'r'
        else:
            mode = 'r+'
            logging.warning("Because path is not same as PYLEARN2_DATA_PATH "\
                "be aware that data might have been modified or pre-processed.")

        if mode == 'r' and (scale or center or (start != None) or
                        (stop != None)):
            raise ValueError("Only for speed there is a copy of hdf5 " +\
                    "file in PYLEARN2_DATA_PATH but it meant to be only " +\
                    "readable. If you wish to modify the data, you should " +\
                    "pass a local copy to the path argument.")

        # load data
        path = preprocess(path)
        file_n = "{}_32x32.h5".format(os.path.join(path, "h5", which_set))
        if os.path.isfile(file_n):
            make_new = False
        else:
            make_new = True
            warnings.warn("File {} does not exist; a new hdf5 file will "
                          "be created".format(file_n))

        # if hdf5 file does not exist make them
        if make_new:
            self.filters = tables.Filters(complib='blosc', complevel=5)
            self.make_data(which_set, path)

        self.h5file = tables.openFile(file_n, mode = mode)
        data = self.h5file.getNode('/', "Data")

        if start != None or stop != None:
            self.h5file, data = self.resize(self.h5file, start, stop)

        # rescale or center if permitted
        if center and scale:
            data.X[:] -= 127.5
            data.X[:] /= 127.5
        elif center:
            data.X[:] -= 127.5
        elif scale:
            data.X[:] /= 255.

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                        axes)
        super(SVHN, self).__init__(X = data.X, y = data.y,
                                    view_converter = view_converter)

        if preprocessor:
            can_fit = which_set in ['train', 'train_all', 'splitted_train']
            preprocessor.apply(self, can_fit)

        self.h5file.flush()
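A hedged usage sketch (read-only access through PYLEARN2_DATA_PATH versus a writable local copy; the local path is hypothetical and assumes the SVHN source files are available there):

# read-only copy under ${PYLEARN2_DATA_PATH}/SVHN/format2/
train = SVHN(which_set='train')

# writable local copy, so center/scale/start/stop are allowed
local = SVHN(which_set='train', path='/tmp/SVHN/format2/',
             center=True, scale=True, start=0, stop=10000)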
Example #44
0
python extract_layer_2_kmeans_features.py private_test
to extract features for the ICML 2013 multimodal learning contest's private test images
(which will be released 72 hours before the contest ends)
""")


if len(sys.argv) != 2:
    usage()
    print('(You used the wrong number of arguments)')
    quit(-1)

_, arg = sys.argv

if arg == 'public_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/public_test_layer_1_features'
    )
    expected_num_images = 500
elif arg == 'private_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/private_test_layer_1_features'
    )
    expected_num_images = 500
else:
    usage()
    print('Unrecognized argument value:', arg)
    print('Recognized values are: public_test, private_test')
    quit(-1)

outdir = base[:-len('layer_1_features')] + 'layer_2_features'
serial.mkdir(outdir)
Example #45
0
    def __init__(self,
                 which_set,
                 center=False,
                 rescale=False,
                 gcn=None,
                 start=None,
                 stop=None,
                 axes=('b', 0, 1, 'c'),
                 toronto_prepro=False,
                 preprocessor=None):
        # note: there is no such thing as the cifar10 validation set;
        # pylearn1 defined one but really it should be user-configurable
        # (as it is here)

        self.axes = axes

        # we define here:
        dtype = 'uint8'
        ntrain = 50000
        nvalid = 0  # artefact, we won't use it
        ntest = 10000

        # we also expose the following details:
        self.img_shape = (3, 32, 32)
        self.img_size = numpy.prod(self.img_shape)
        self.n_classes = 10
        self.label_names = [
            'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
            'horse', 'ship', 'truck'
        ]

        # prepare loading
        fnames = ['data_batch_%i' % i for i in range(1, 6)]
        datasets = {}
        datapath = os.path.join(
            string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'cifar10',
            'cifar-10-batches-py')
        for name in fnames + ['test_batch']:
            fname = os.path.join(datapath, name)
            if not os.path.exists(fname):
                raise IOError(fname + " was not found. You probably need to "
                              "download the CIFAR-10 dataset by using the "
                              "download script in "
                              "pylearn2/scripts/datasets/download_cifar10.sh "
                              "or manually from "
                              "http://www.cs.utoronto.ca/~kriz/cifar.html")
            datasets[name] = cache.datasetCache.cache_file(fname)

        lenx = int(numpy.ceil((ntrain + nvalid) / 10000.) * 10000)
        x = numpy.zeros((lenx, self.img_size), dtype=dtype)
        y = numpy.zeros((lenx, 1), dtype=dtype)

        # load train data
        nloaded = 0
        for i, fname in enumerate(fnames):
            _logger.info('loading file %s' % datasets[fname])
            data = serial.load(datasets[fname])
            x[i * 10000:(i + 1) * 10000, :] = data['data']
            y[i * 10000:(i + 1) * 10000, 0] = data['labels']
            nloaded += 10000
            if nloaded >= ntrain + nvalid + ntest:
                break

        # load test data
        _logger.info('loading file %s' % datasets['test_batch'])
        data = serial.load(datasets['test_batch'])

        # process this data
        Xs = {'train': x[0:ntrain], 'test': data['data'][0:ntest]}

        Ys = {'train': y[0:ntrain], 'test': data['labels'][0:ntest]}

        X = numpy.cast['float32'](Xs[which_set])
        y = Ys[which_set]

        if isinstance(y, list):
            y = numpy.asarray(y).astype(dtype)

        if which_set == 'test':
            assert y.shape[0] == 10000
            y = y.reshape((y.shape[0], 1))

        if center:
            X -= 127.5
        self.center = center

        if rescale:
            X /= 127.5
        self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = CIFAR10(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        self.toronto_prepro = toronto_prepro

        self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn)

        if start is not None:
            # This needs to come after the prepro so that it doesn't
            # change the pixel means computed above for toronto_prepro
            assert start >= 0
            assert stop > start
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            y = y[start:stop, :]
            assert X.shape[0] == y.shape[0]

        if which_set == 'test':
            assert X.shape[0] == 10000

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)

        super(CIFAR10, self).__init__(X=X,
                                      y=y,
                                      view_converter=view_converter,
                                      y_labels=self.n_classes)

        assert not contains_nan(self.X)

        if preprocessor:
            preprocessor.apply(self)
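A hedged usage sketch (the split sizes are hypothetical; CIFAR-10 itself has 50000 train and 10000 test examples, as asserted above):

train = CIFAR10(which_set='train', gcn=55., start=0, stop=40000)
valid = CIFAR10(which_set='train', gcn=55., start=40000, stop=50000)
test = CIFAR10(which_set='test', gcn=55.)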
Example #46
0
This script also translates the data to lie in [-127.5, 127.5] instead of
[0, 255]. This makes it play nicer with some of pylearn2's visualization
tools.
"""

from __future__ import print_function

from theano.compat.six.moves import xrange
from pylearn2.datasets.stl10 import STL10
from pylearn2.datasets.preprocessing import Downsample
from pylearn2.utils import string_utils as string
from pylearn2.utils import serial
import numpy as np

print('Preparing output directory...')

data_dir = string.preprocess('${PYLEARN2_DATA_PATH}')
downsampled_dir = data_dir + '/stl10_32x32'
serial.mkdir(downsampled_dir)
README = open(downsampled_dir + '/README', 'w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load. They contain pylearn2
Dataset objects defining the STL-10 dataset, but downsampled to
size 32x32 and translated to lie in [-127.5, 127.5 ].

They were created with the pylearn2 script make_downsampled_stl10.py

All other files in this directory, including this README, were
created by the same script and are necessary for the other files
to function correctly.
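For reference, the translation to [-127.5, 127.5] described above is just a constant shift; a minimal numpy sketch, using a small hypothetical design matrix with uint8 values in [0, 255]:

import numpy as np

X = np.random.randint(0, 256, size=(4, 32 * 32 * 3)).astype('float32')
X -= 127.5   # values now lie in [-127.5, 127.5]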
Example #47
0
def show(image):
    """
    Parameters
    ----------
    image : PIL Image object or ndarray
        If ndarray, integer formats are assumed to use 0-255
        and float formats are assumed to use 0-1
    """
    if hasattr(image, '__array__'):
        #do some shape checking because PIL just raises a tuple indexing error
        #that doesn't make it very clear what the problem is
        if len(image.shape) < 2 or len(image.shape) > 3:
            raise ValueError('image must have either 2 or 3 dimensions but its'
                             ' shape is ' + str(image.shape))

        if image.dtype == 'int8':
            image = np.cast['uint8'](image)
        elif str(image.dtype).startswith('float'):
            #don't use *=, we don't want to modify the input array
            image = image * 255.
            image = np.cast['uint8'](image)

        #PIL is too stupid to handle single-channel arrays
        if len(image.shape) == 3 and image.shape[2] == 1:
            image = image[:,:,0]

        try:
            ensure_Image()
            image = Image.fromarray(image)
        except TypeError:
            raise TypeError("PIL issued TypeError on ndarray of shape " +
                            str(image.shape) + " and dtype " +
                            str(image.dtype))


    try:
        f = NamedTemporaryFile(mode='r', suffix='.png', delete=False)
    except TypeError:
        # before python2.7, we can't use the delete argument
        f = NamedTemporaryFile(mode='r', suffix='.png')
        """
        TODO: prior to Python 2.7, NamedTemporaryFile has no delete=False
        argument. Unfortunately, that means f.close() deletes the file. We
        then save an image to the file in the next line, so there's a race
        condition where, for an instant, we don't actually have the file on
        the filesystem reserving the name, and then write to that name anyway.

        TODO: see if this can be remedied with lower level calls (mkstemp)
        """
        warnings.warn('filesystem race condition')

    name = f.name
    f.flush()
    f.close()
    image.save(name)
    viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}')
    if os.name == 'nt':
        subprocess.Popen(viewer_command + ' ' + name +' && del ' + name,
                         shell=True)
    else:
        subprocess.Popen(viewer_command + ' ' + name +' ; rm ' + name,
                         shell=True)
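A hedged usage sketch (assumes PYLEARN2_VIEWER_COMMAND points at an image viewer available on your PATH; 'eog' here is just an example):

import os
import numpy as np

os.environ['PYLEARN2_VIEWER_COMMAND'] = 'eog'
show(np.random.rand(32, 32, 3))   # float images are assumed to lie in [0, 1]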
Example #48
0
"""
This script makes a dataset of 32x32 contrast normalized, approximately
whitened CIFAR-100 images.

"""

from pylearn2.utils import serial
from pylearn2.datasets import preprocessing
from pylearn2.utils import string_utils
from pylearn2.datasets.cifar100 import CIFAR100

data_dir = string_utils.preprocess('${PYLEARN2_DATA_PATH}/cifar100')

print 'Loading CIFAR-100 train dataset...'
train = CIFAR100(which_set = 'train', gcn = 55.)

print "Preparing output directory..."
output_dir = data_dir + '/pylearn2_gcn_whitened'
serial.mkdir( output_dir )
README = open(output_dir + '/README','w')

README.write("""
The .pkl files in this directory may be opened in python using
cPickle, pickle, or pylearn2.serial.load.

train.pkl, and test.pkl each contain
a pylearn2 Dataset object defining a labeled
dataset of a 32x32 contrast normalized, approximately whitened version of the CIFAR-100
dataset. train.pkl contains labeled train examples. test.pkl
contains labeled test examples.
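The snippet is truncated here; the rest of the script presumably closes the README and whitens the data much like the STL-10 script in Example #59 below. A minimal sketch under that assumption, using pylearn2's ZCA preprocessor:

README.close()

preprocessor = preprocessing.ZCA()
train.apply_preprocessor(preprocessor=preprocessor, can_fit=True)
train.use_design_loc(output_dir + '/train.npy')
serial.save(output_dir + '/train.pkl', train)

test = CIFAR100(which_set='test', gcn=55.)
test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)
test.use_design_loc(output_dir + '/test.npy')
serial.save(output_dir + '/test.pkl', test)

serial.save(output_dir + '/preprocessor.pkl', preprocessor)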
Example #49
0
def _load(filepath, recurse_depth=0, retry=True):
    """
    Recursively tries to load a file until success or maximum number of
    attempts.
    Parameters
    ----------
    filepath : str
        A path to a file to load. Should be a pickle, Matlab, or NumPy
        file; or a .txt or .amat file that numpy.loadtxt can load.
    recurse_depth : int, optional
        End users should not use this argument. It is used by the function
        itself to implement the `retry` option recursively.
    retry : bool, optional
        If True, will make a handful of attempts to load the file before
        giving up. This can be useful if you are for example calling
        show_weights.py on a file that is actively being written to by a
        training script--sometimes the load attempt might fail if the
        training script writes at the same time show_weights tries to
        read, but if you try again after a few seconds you should be able
        to open the file.
    Returns
    -------
    loaded_object : object
        The object that was stored in the file.
    """
    try:
        import joblib
        joblib_available = True
    except ImportError:
        joblib_available = False
    if recurse_depth == 0:
        filepath = preprocess(filepath)

    if filepath.endswith('.npy') or filepath.endswith('.npz'):
        return np.load(filepath)

    if filepath.endswith('.amat') or filepath.endswith('txt'):
        try:
            return np.loadtxt(filepath)
        except Exception:
            reraise_as("{0} cannot be loaded by serial.load (trying "
                       "to use np.loadtxt)".format(filepath))

    if filepath.endswith('.mat'):
        global io
        if io is None:
            import scipy.io
            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError as nei:
            if str(nei).find('HDF reader') != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py
                    hdf_reader = h5py
                return hdf_reader.File(filepath, 'r')
            else:
                raise
        # this code should never be reached
        assert False

    # for loading PY2 pickle in PY3
    encoding = {'encoding': 'latin-1'} if six.PY3 else {}

    def exponential_backoff():
        if recurse_depth > 9:
            logger.info('Max number of tries exceeded while trying to open '
                        '{0}'.format(filepath))
            logger.info('attempting to open via reading string')
            with open(filepath, 'rb') as f:
                content = f.read()
            return cPickle.loads(content, **encoding)
        else:
            nsec = 0.5 * (2.0 ** float(recurse_depth))
            logger.info("Waiting {0} seconds and trying again".format(nsec))
            time.sleep(nsec)
            return _load(filepath, recurse_depth + 1, retry)

    try:
        if not joblib_available:
            with open(filepath, 'rb') as f:
                obj = cPickle.load(f, **encoding)
        else:
            try:
                obj = joblib.load(filepath)
            except Exception as e:
                if os.path.exists(filepath) and not os.path.isdir(filepath):
                    raise
                raise_cannot_open(filepath)
    except MemoryError as e:
        # We want to explicitly catch this exception because for MemoryError
        # __str__ returns the empty string, so some of our default printouts
        # below don't make a lot of sense.
        # Also, a lot of users assume any exception is a bug in the library,
        # so we can cut down on mail to pylearn-users by adding a message
        # that makes it clear this exception is caused by their machine not
        # meeting requirements.
        if os.path.splitext(filepath)[1] == ".pkl":
            improve_memory_error_message(e,
                                         ("You do not have enough memory to "
                                          "open %s \n"
                                          " + Try using numpy.{save,load} "
                                          "(file with extension '.npy') "
                                          "to save your file. It uses less "
                                          "memory when reading and "
                                          "writing files than pickled files.")
                                         % filepath)
        else:
            improve_memory_error_message(e,
                                         "You do not have enough memory to "
                                         "open %s" % filepath)

    except (BadPickleGet, EOFError, KeyError) as e:
        if not retry:
            reraise_as(e.__class__('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except ValueError:
        logger.exception('Failed to open {0}'.format(filepath))

        if not retry:
            reraise_as(ValueError('Failed to open {0}'.format(filepath)))
        obj = exponential_backoff()
    except Exception:
        # assert False
        reraise_as("Couldn't open {0}".format(filepath))

    # if the object has no yaml_src, we give it one that just says it
    # came from this file. could cause trouble if you save obj again
    # to a different location
    if not hasattr(obj, 'yaml_src'):
        try:
            obj.yaml_src = '!pkl: "' + os.path.abspath(filepath) + '"'
        except Exception:
            pass

    return obj
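For reference, exponential_backoff above sleeps 0.5 * 2**recurse_depth seconds before each retry and gives up after depth 9, so a file that never becomes readable costs roughly:

total_wait = sum(0.5 * 2.0 ** depth for depth in range(10))   # 511.5 seconds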
Example #50
0
    def __init__(self, which_set, center=False, custom_path=None):

        assert which_set in ['train', 'unlabeled', 'custom']

        path = "${PYLEARN2_DATA_PATH}/TLChallenge"

        if which_set == 'train':
            path += '/training/training-data.dat'
        elif which_set == 'unlabeled':
            path += '/unlabelled_tiny.dat'
        elif which_set == 'custom':
            path = custom_path

        path = preprocess(path)

        X = N.fromfile(path, dtype=N.uint8, sep=' ')

        X = X.reshape(X.shape[0] // (32 * 32 * 3), 32 * 32 * 3, order='F')

        assert X.max() == 255
        assert X.min() == 0

        X = N.cast['float32'](X)
        y = None

        if center:
            X -= 127.5

        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3))

        X = view_converter.design_mat_to_topo_view(X)

        X = N.transpose(X, (0, 2, 1, 3))

        X = view_converter.topo_view_to_design_mat(X)

        super(TL_Challenge, self).__init__(X=X,
                                           y=y,
                                           view_converter=view_converter)

        assert not N.any(N.isnan(self.X))

        if which_set == 'train':
            self.y_fine = N.fromfile(preprocess(
                "${PYLEARN2_DATA_PATH}/TLChallenge/training/training-labels.dat"
            ),
                                     dtype=N.uint8,
                                     sep=' ')
            assert len(self.y_fine.shape) == 1
            assert self.y_fine.shape[0] == X.shape[0]
            #0 :  aquatic_mammals
            #1 :  fish
            #2 :  flowers
            FOOD_CONTAINER = 3
            FRUIT = 4
            #5 :  household_electrical_devices
            FURNITURE = 6
            INSECTS = 7
            #8 :  large_carnivores
            #9 :  large_man-made_outdoor_things
            #10 :  large_natural_outdoor_scenes
            LARGE_OMNIVORES_HERBIVORES = 11
            MEDIUM_MAMMAL = 12
            #13 :  non-insect_invertebrates
            #14 :  people
            #15 :  reptiles
            #16 :  small_mammals
            #17 :  trees
            #18 :  vehicles_1
            #19 :  vehicles_2

            self.y_coarse = self.y_fine.copy()
            self.y_coarse[self.y_coarse == 100] = INSECTS
            self.y_coarse[self.y_coarse == 101] = LARGE_OMNIVORES_HERBIVORES
            self.y_coarse[self.y_coarse == 102] = LARGE_OMNIVORES_HERBIVORES
            self.y_coarse[self.y_coarse == 103] = LARGE_OMNIVORES_HERBIVORES
            self.y_coarse[self.y_coarse == 104] = FRUIT
            self.y_coarse[self.y_coarse == 105] = FOOD_CONTAINER
            self.y_coarse[self.y_coarse == 106] = FRUIT
            self.y_coarse[self.y_coarse == 107] = MEDIUM_MAMMAL
            self.y_coarse[self.y_coarse == 108] = FRUIT
            self.y_coarse[self.y_coarse == 109] = FURNITURE

            assert self.y_coarse.min() == 3
            assert self.y_coarse.max() == 12

            for i in xrange(120):
                if self.y_coarse[i] == FRUIT:
                    assert self.y_fine[i] in [104, 106, 108]
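A hedged usage sketch (the custom path is hypothetical; 'custom' expects the same whitespace-separated uint8 dump that numpy.fromfile reads above):

train = TL_Challenge(which_set='train', center=True)
extra = TL_Challenge(which_set='custom', custom_path='/tmp/my_patches.dat')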
Example #51
0
def save(filepath, obj, on_overwrite='ignore'):
    """
    Serialize `object` to a file denoted by `filepath`.

    Parameters
    ----------
    filepath : str
        A filename. If the suffix is `.joblib` and joblib can be
        imported, `joblib.dump` is used in place of the regular
        pickling mechanisms; this results in much faster saves by
        saving arrays as separate .npy files on disk. If the file
        suffix is `.npy` then `numpy.save` is attempted on `obj`.
        Otherwise, (c)pickle is used.

    obj : object
        A Python object to be serialized.

    on_overwrite : str, optional
        A string specifying what to do if the file already exists:
        'ignore' : just overwrite it.
        'backup' : make a copy of the file (<filepath>.bak) and
            delete it when done saving the new copy. This allows
            recovery of the old version of the file if saving the
            new one fails.
    """

    filepath = preprocess(filepath)

    if os.path.exists(filepath):
        if on_overwrite == 'backup':
            backup = filepath + '.bak'
            shutil.move(filepath, backup)
            save(filepath, obj)
            os.remove(backup)
            return
        else:
            assert on_overwrite == 'ignore'

    try:
        _save(filepath, obj)
    except RuntimeError as e:
        """ Sometimes for large theano graphs, pickle/cPickle exceed the
            maximum recursion depth. This seems to me like a fundamental
            design flaw in pickle/cPickle. The workaround I employ here
            is the one recommended to someone who had a similar problem
            on stackexchange:

            http://stackoverflow.com/questions/2134706/hitting-maximum-recursion-depth-using-pythons-pickle-cpickle

            Obviously this does not scale and could cause a crash
            but I don't see another solution short of writing our
            own implementation of pickle.
        """
        if str(e).find('recursion') != -1:
            warnings.warn('pylearn2.utils.save encountered the following '
                          'error: ' + str(e) +
                          '\nAttempting to resolve this error by calling ' +
                          'sys.setrecursionlimit and retrying')
            old_limit = sys.getrecursionlimit()
            try:
                sys.setrecursionlimit(50000)
                _save(filepath, obj)
            finally:
                sys.setrecursionlimit(old_limit)
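A hedged usage sketch (the path and the `model` object are hypothetical; on_overwrite='backup' keeps a .bak copy until the new save succeeds):

save('/tmp/model.pkl', model, on_overwrite='backup')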
Example #52
0
    def __init__(self,
                 dataset,
                 model,
                 algorithm=None,
                 save_path=None,
                 save_freq=0,
                 extensions=None,
                 allow_overwrite=True):
        """
        Construct a Train instance.

        Parameters
        ----------
        dataset : `pylearn2.datasets.dataset.Dataset`
        model : `pylearn2.models.model.Model`
        algorithm : <Optional>
        `pylearn2.training_algorithms.training_algorithm.TrainingAlgorithm`
        save_path : <Optional> str
            Path to save (with pickle / joblib) the model.
        save_freq : <Optional> int
            Frequency of saves, in epochs. A frequency of zero disables
            automatic saving altogether. A frequency of 1 saves every
            epoch. A frequency of 2 saves every other epoch, etc.
            (default=0, i.e. never save). Note: when automatic saving is
            enabled (eg save_freq > 0), the model is always saved after
            learning, even when the final epoch is not a multiple of
            `save_freq`.
        extensions : <Optional> iterable
            A collection of `TrainExtension` objects whose callbacks are
            triggered at various points in learning.
        allow_overwrite : <Optional> bool
            If `True`, will save the model to save_path even if there is
            already something there. Otherwise, will raise an error if the
            `save_path` is already occupied.
        """
        self.allow_overwrite = allow_overwrite
        self.first_save = True
        self.dataset = dataset
        self.model = model
        self.algorithm = algorithm
        if save_path is not None:
            if save_freq == 0:
                warnings.warn('save_path specified but save_freq is 0 '
                              '(never save). Is this intentional?')
            self.save_path = preprocess(save_path)
        else:
            if save_freq > 0:
                phase_variable = 'PYLEARN2_TRAIN_PHASE'
                if phase_variable in os.environ:
                    phase = 'phase%d' % int(os.environ[phase_variable])
                    tokens = [
                        os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], phase,
                        'pkl'
                    ]
                else:
                    tokens = os.environ['PYLEARN2_TRAIN_FILE_FULL_STEM'], 'pkl'
                self.save_path = '.'.join(tokens)
        self.save_freq = save_freq

        if hasattr(self.dataset, 'yaml_src'):
            self.model.dataset_yaml_src = self.dataset.yaml_src
        else:
            warnings.warn("dataset has no yaml src, model won't know what " +
                          "data it was trained on")

        self.extensions = extensions if extensions is not None else []
        self.training_seconds = sharedX(value=0,
                                        name='training_seconds_this_epoch')
        self.total_seconds = sharedX(value=0, name='total_seconds_last_epoch')
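A hedged usage sketch (the dataset, model, and algorithm objects are hypothetical; main_loop() is assumed to be the entry point that runs training and the periodic saves described above):

train = Train(dataset=my_dataset, model=my_model, algorithm=my_sgd,
              save_path='/tmp/model.pkl', save_freq=1)
train.main_loop()   # assumed entry point; trains and saves every epoch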
Example #53
0
    def __init__(
            self,
            which_set,
            base_path='/data/vision/billf/manifold-learning/DL/Data/icml_2013_emotions',
            start=None,
            stop=None,
            preprocessor=None,
            fit_preprocessor=False,
            axes=('b', 0, 1, 'c'),
            fit_test_preprocessor=False,
            randindex=None,
            trainindex=None):
        """
        Parameters
        ----------
        which_set : str
            A string specifying which portion of the dataset to load.
            Valid values are 'train' or 'public_test'.
        base_path : str
            The directory containing the .csv files from kaggle.com.
            This directory should be writable; if the .csv files haven't
            already been converted to npy, this class will convert them
            to save memory the next time they are loaded.
        fit_preprocessor : bool
            True if the preprocessor is allowed to fit the data.
        fit_test_preprocessor : bool
            If we construct a test set based on this dataset, should it
            be allowed to fit the test set?
        """

        self.test_args = locals()
        self.test_args['which_set'] = 'public_test'
        self.test_args['fit_preprocessor'] = fit_test_preprocessor
        del self.test_args['start']
        del self.test_args['stop']
        del self.test_args['self']

        files = {'train': 'train.csv', 'public_test': 'test.csv'}

        try:
            filename = files[which_set]
        except KeyError:
            raise ValueError("Unrecognized dataset name: " + which_set)

        path = base_path + '/' + filename

        path = preprocess(path)

        X, y = self._load_data(path, which_set == 'train')

        if start is not None:
            assert which_set != 'test'
            assert isinstance(start, int)
            assert isinstance(stop, int)
            assert start >= 0
            assert start < stop
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            if y is not None:
                y = y[start:stop, :]
        """
        if trainindex:
            X_list_flipLR, X_list_flipUD = self.flipData(X)
            X = X + X_list_flipLR
            y = y + y
        """
        view_converter = DefaultViewConverter(shape=[48, 48, 1], axes=axes)

        super(EmotionsDataset, self).__init__(X=X,
                                              y=y,
                                              view_converter=view_converter)

        if preprocessor:
            preprocessor.apply(self, can_fit=fit_preprocessor)
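A hedged usage sketch (the split indices are hypothetical):

train = EmotionsDataset(which_set='train', start=0, stop=25000)
valid = EmotionsDataset(which_set='train', start=25000, stop=28000)
test = EmotionsDataset(which_set='public_test')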
Example #54
0
def show(image):
    """
    .. todo::

        WRITEME

    Parameters
    ----------
    image : PIL Image object or ndarray
        If ndarray, integer formats are assumed to use 0-255
        and float formats are assumed to use 0-1
    """
    if hasattr(image, '__array__'):
        #do some shape checking because PIL just raises a tuple indexing error
        #that doesn't make it very clear what the problem is
        if len(image.shape) < 2 or len(image.shape) > 3:
            raise ValueError('image must have either 2 or 3 dimensions but its'
                             ' shape is ' + str(image.shape))

        if image.dtype == 'int8':
            image = np.cast['uint8'](image)
        elif str(image.dtype).startswith('float'):
            #don't use *=, we don't want to modify the input array
            image = image * 255.
            image = np.cast['uint8'](image)

        #PIL is too stupid to handle single-channel arrays
        if len(image.shape) == 3 and image.shape[2] == 1:
            image = image[:, :, 0]

        try:
            ensure_Image()
            image = Image.fromarray(image)
        except TypeError:
            raise TypeError("PIL issued TypeError on ndarray of shape " +
                            str(image.shape) + " and dtype " +
                            str(image.dtype))

    # Create a temporary file with the suffix '.png'.
    fd, name = mkstemp(suffix='.png')
    os.close(fd)

    # Note:
    #   Although we can use tempfile.NamedTemporaryFile() to create
    #   a temporary file, the function should be used with care.
    #
    #   In Python earlier than 2.7, a temporary file created by the
    #   function will be deleted just after the file is closed.
    #   We can re-use the name of the temporary file, but there is an
    #   instant where a file with the name does not exist in the file
    #   system before we re-use the name. This may cause a race
    #   condition.
    #
    #   In Python 2.7 or later, tempfile.NamedTemporaryFile() has
    #   the 'delete' argument which can control whether a temporary
    #   file will be automatically deleted or not. With the argument,
    #   the above race condition can be avoided.
    #

    image.save(name)
    viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}')
    if os.name == 'nt':
        subprocess.Popen(viewer_command + ' ' + name + ' && del ' + name,
                         shell=True)
    else:
        subprocess.Popen(viewer_command + ' ' + name + ' ; rm ' + name,
                         shell=True)
Example #55
0
    def __init__(self,
                 path='train.csv',
                 task='classification',
                 one_hot=False,
                 expect_labels=True,
                 expect_headers=True,
                 delimiter=',',
                 start=None,
                 stop=None,
                 start_fraction=None,
                 end_fraction=None):
        """
        .. todo::

            WRITEME
        """
        self.path = path
        self.task = task
        self.one_hot = one_hot
        self.expect_labels = expect_labels
        self.expect_headers = expect_headers
        self.delimiter = delimiter
        self.start = start
        self.stop = stop
        self.start_fraction = start_fraction
        self.end_fraction = end_fraction

        self.view_converter = None

        if task not in ['classification', 'regression']:
            raise ValueError('task must be either "classification" or '
                             '"regression"; got ' + str(task))

        if start_fraction is not None:
            if end_fraction is not None:
                raise ValueError("Use start_fraction or end_fraction, "
                                 " not both.")
            if start_fraction <= 0:
                raise ValueError("start_fraction should be > 0")

            if start_fraction >= 1:
                raise ValueError("start_fraction should be < 1")

        if end_fraction is not None:
            if end_fraction <= 0:
                raise ValueError("end_fraction should be > 0")

            if end_fraction >= 1:
                raise ValueError("end_fraction should be < 1")

        if start is not None:
            if start_fraction is not None or end_fraction is not None:
                raise ValueError("Use start, start_fraction, or end_fraction,"
                                 " just not together.")

        if stop is not None:
            if start_fraction is not None or end_fraction is not None:
                raise ValueError("Use stop, start_fraction, or end_fraction,"
                                 " just not together.")

        # and go

        self.path = preprocess(self.path)
        X, y = self._load_data()

        super(CSVDataset, self).__init__(X=X, y=y)
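A hedged usage sketch (the file path under ${PYLEARN2_DATA_PATH} is hypothetical; start_fraction/end_fraction presumably select the leading or trailing fraction of rows):

train = CSVDataset(path='${PYLEARN2_DATA_PATH}/my_csv/train.csv',
                   task='regression', one_hot=False,
                   start_fraction=0.8)
valid = CSVDataset(path='${PYLEARN2_DATA_PATH}/my_csv/train.csv',
                   task='regression', end_fraction=0.2)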
Example #56
0
or
python lcn.py private_test
to preprocess the ICML 2013 multimodal learning contest's private test images
(which will be released 72 hours before the contest ends)
"""


if len(sys.argv) != 2:
    usage()
    print '(You used the wrong number of arguments)'
    quit(-1)

_, arg = sys.argv

if arg == 'public_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/public_test_images')
    outdir = base[:-6] + 'lcn'
    expected_num_images = 500
elif arg == 'private_test':
    base = preprocess(
        '${PYLEARN2_DATA_PATH}/icml_2013_multimodal/private_test_images')
    outdir = base[:-6] + 'lcn'
    expected_num_images = 500
else:
    usage()
    print 'Unrecognized argument value:', arg
    print 'Recognized values are: public_test, private_test'
    quit(-1)

serial.mkdir(outdir)

paths = os.listdir(base)
Example #57
0
    if len(sys.argv) < 5:
        print 'Usage: analysis.py <path/to/voter_models> <path/to/valid_images> <path/to/test_images> <batch>'
        sys.exit(-1)

    models_path = sys.argv[1]
    # 'data/food100/output_resized_64/img_61_*.jpg'
    valid_path = sys.argv[2]
    test_path = sys.argv[3]
    batch = int(sys.argv[4])

    class_to_id = get_classes()
    id_to_class = {v: k for k, v in class_to_id.items()}
    class_to_superclass = get_mapping()

    label_names_pkl_path = os.path.join(string_utils.preprocess('${PYLEARN2_DATA_PATH}'), 'food100', 'label_names.pkl')
    label_names_pkl = open(label_names_pkl_path, 'rb')
    label_names = pickle.load(label_names_pkl)
    class_to_label = {l : i for i, l in enumerate(label_names)}
    label_names_pkl.close()

    print 'Loading valid set from: %s' % valid_path
    valid_set = [img for img in glob(valid_path) if is_included(img, class_to_superclass, id_to_class)]

    voters = get_voters(models_path)
    confidence_matrix = np.zeros((len(voters), len(label_names)), dtype=float)
    for i, voter in enumerate(voters):
        cm = get_confusion_matrix(len(label_names), id_to_class, class_to_superclass, class_to_label, valid_set, voter, batch)
        print 'model#%d... misclass rate: %f' % (i, get_misclass(cm))
        cm /= np.sum(cm, axis=0)
        confidence_matrix[i] = cm.diagonal()
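For clarity, each confidence row above is the diagonal of the column-normalized confusion matrix; a minimal numpy sketch with a hypothetical 3-class matrix:

import numpy as np

cm = np.array([[5., 1., 0.],
               [0., 4., 2.],
               [1., 0., 6.]])
cm = cm / cm.sum(axis=0)      # normalize each column
confidence = cm.diagonal()    # one confidence value per class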
Example #58
0
def show(image):
    """
    .. todo::

        WRITEME

    Parameters
    ----------
    image : PIL Image object or ndarray
        If ndarray, integer formats are assumed to use 0-255
        and float formats are assumed to use 0-1
    """
    viewer_command = string.preprocess('${PYLEARN2_VIEWER_COMMAND}')

    if viewer_command == 'inline':
        return imview(image)

    if hasattr(image, '__array__'):
        # do some shape checking because PIL just raises a tuple indexing error
        # that doesn't make it very clear what the problem is
        if len(image.shape) < 2 or len(image.shape) > 3:
            raise ValueError('image must have either 2 or 3 dimensions but its'
                             ' shape is ' + str(image.shape))

        # The below is a temporary workaround that prevents us from crashing
        # 3rd party image viewers such as eog by writing out overly large
        # images.
        # In the long run we should determine if this is a bug in PIL when
        # producing such images or a bug in eog, and determine a proper fix.
        # Since this is hopefully just a short term workaround the
        # constants below are not included in the interface to the
        # function, so that 3rd party code won't start passing them.
        max_height = 4096
        max_width = 4096

        # Display separate warnings for each direction, since it's
        # common to crop only one.
        if image.shape[0] > max_height:
            image = image[0:max_height, :, :]
            warnings.warn("Cropping image to smaller height to avoid crashing "
                          "the viewer program.")
        if image.shape[1] > max_width:
            image = image[:, 0:max_width, :]
            warnings.warn("Cropping the image to a smaller width to avoid "
                          "crashing the viewer program.")
        # This ends the workaround

        if image.dtype == 'int8':
            image = np.cast['uint8'](image)
        elif str(image.dtype).startswith('float'):
            # don't use *=, we don't want to modify the input array
            image = image * 255.
            image = np.cast['uint8'](image)

        # PIL is too stupid to handle single-channel arrays
        if len(image.shape) == 3 and image.shape[2] == 1:
            image = image[:, :, 0]

        try:
            ensure_Image()
            image = Image.fromarray(image)
        except TypeError:
            reraise_as(TypeError("PIL issued TypeError on ndarray of shape " +
                                 str(image.shape) + " and dtype " +
                                 str(image.dtype)))

    # Create a temporary file with the suffix '.png'.
    fd, name = mkstemp(suffix='.png')
    os.close(fd)

    # Note:
    #   Although we can use tempfile.NamedTemporaryFile() to create
    #   a temporary file, the function should be used with care.
    #
    #   In Python earlier than 2.7, a temporary file created by the
    #   function will be deleted just after the file is closed.
    #   We can re-use the name of the temporary file, but there is an
    #   instant where a file with the name does not exist in the file
    #   system before we re-use the name. This may cause a race
    #   condition.
    #
    #   In Python 2.7 or later, tempfile.NamedTemporaryFile() has
    #   the 'delete' argument which can control whether a temporary
    #   file will be automatically deleted or not. With the argument,
    #   the above race condition can be avoided.
    #

    image.save(name)
    if os.name == 'nt':
        subprocess.Popen(viewer_command + ' ' + name + ' && del ' + name,
                         shell=True)
    else:
        subprocess.Popen(viewer_command + ' ' + name + ' ; rm ' + name,
                         shell=True)
Example #59
0
def main():
    data_dir = string.preprocess('${PYLEARN2_DATA_PATH}/stl10')

    print('Loading STL-10 unlabeled and train datasets...')
    downsampled_dir = data_dir + '/stl10_32x32'

    data = serial.load(downsampled_dir + '/unlabeled.pkl')
    supplement = serial.load(downsampled_dir + '/train.pkl')

    print('Concatenating datasets...')
    data.set_design_matrix(np.concatenate((data.X, supplement.X), axis=0))

    print("Preparing output directory...")
    output_dir = data_dir + '/stl10_32x32_whitened'
    serial.mkdir(output_dir)
    README = open(output_dir + '/README', 'w')

    README.write(
        textwrap.dedent("""
    The .pkl files in this directory may be opened in python using
    cPickle, pickle, or pylearn2.serial.load.

    unsupervised.pkl, unlabeled.pkl, train.pkl, and test.pkl each contain
    a pylearn2 Dataset object defining an unlabeled
    dataset of a 32x32 approximately whitened version of the STL-10
    dataset. unlabeled.pkl contains unlabeled train examples. train.pkl
    contains labeled train examples. unsupervised.pkl contains the union
    of these (without any labels). test.pkl contains the labeled test
    examples.

    preprocessor.pkl contains a pylearn2 ZCA object that was used
    to approximately whiten the images. You may want to use this
    object later to preprocess other images.

    They were created with the pylearn2 script make_stl10_whitened.py.

    All other files in this directory, including this README, were
    created by the same script and are necessary for the other files
    to function correctly.
    """))

    README.close()

    print("Learning the preprocessor \
          and preprocessing the unsupervised train data...")
    preprocessor = preprocessing.ZCA()
    data.apply_preprocessor(preprocessor=preprocessor, can_fit=True)

    print('Saving the unsupervised data')
    data.use_design_loc(output_dir + '/unsupervised.npy')
    serial.save(output_dir + '/unsupervised.pkl', data)

    X = data.X
    unlabeled = X[0:100 * 1000, :]
    labeled = X[100 * 1000:, :]
    del X

    print("Saving the unlabeled data")
    data.X = unlabeled
    data.use_design_loc(output_dir + '/unlabeled.npy')
    serial.save(output_dir + '/unlabeled.pkl', data)
    del data
    del unlabeled

    print("Saving the labeled train data")
    supplement.X = labeled
    supplement.use_design_loc(output_dir + '/train.npy')
    serial.save(output_dir + '/train.pkl', supplement)
    del supplement
    del labeled

    print("Loading the test data")
    test = serial.load(downsampled_dir + '/test.pkl')

    print("Preprocessing the test data")
    test.apply_preprocessor(preprocessor=preprocessor, can_fit=False)

    print("Saving the test data")
    test.use_design_loc(output_dir + '/test.npy')
    serial.save(output_dir + '/test.pkl', test)

    serial.save(output_dir + '/preprocessor.pkl', preprocessor)
Example #60
0
def load(filepath, recurse_depth=0, retry=True):
    """
    Parameters
    ----------
    filepath : str
        A path to a file to load. Should be a pickle, Matlab, or NumPy
        file; or a .txt or .amat file that numpy.loadtxt can load.
    recurse_depth : int
        End users should not use this argument. It is used by the function
        itself to implement the `retry` option recursively.
    retry : bool
        If True, will make a handful of attempts to load the file before
        giving up. This can be useful if you are for example calling
        show_weights.py on a file that is actively being written to by a
        training script--sometimes the load attempt might fail if the
        training script writes at the same time show_weights tries to
        read, but if you try again after a few seconds you should be able
        to open the file.

    Returns
    -------
    loaded_object : object
        The object that was stored in the file.

    .. todo::

        Refactor to hide recurse_depth from end users
    """
    try:
        import joblib
        joblib_available = True
    except ImportError:
        joblib_available = False
    if recurse_depth == 0:
        filepath = preprocess(filepath)

    if filepath.endswith('.npy') or filepath.endswith('.npz'):
        return np.load(filepath)

    if filepath.endswith('.amat') or filepath.endswith('txt'):
        try:
            return np.loadtxt(filepath)
        except Exception:
            logger.exception("{0} cannot be loaded by serial.load (trying to"
                             " use np.loadtxt)".format(filepath))
            raise

    if filepath.endswith('.mat'):
        global io
        if io is None:
            import scipy.io
            io = scipy.io
        try:
            return io.loadmat(filepath)
        except NotImplementedError as nei:
            if str(nei).find('HDF reader') != -1:
                global hdf_reader
                if hdf_reader is None:
                    import h5py
                    hdf_reader = h5py
                return hdf_reader.File(filepath)
            else:
                raise
        #this code should never be reached
        assert False