Example #1
def train_nice(args):
    vn = True
    center = True
    # data_path is needed for variance_map_file below in both branches, so
    # resolve it before branching.
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)
    if args.transposed:
        fmri = MRI.MRI_Transposed(dataset_name=args.dataset_name,
                                  even_input=True)
        input_dim = fmri.X.shape[1]
        del fmri
    else:
        mask_file = path.join(data_path, "mask.npy")
        mask = np.load(mask_file)
        input_dim = (mask == 1).sum()
        if input_dim % 2 == 1:
            input_dim -= 1

    logging.info("Input shape: %d" % input_dim)

    p = path.abspath(path.dirname(__file__))
    yaml_file = path.join(p, "nice_%s.yaml" % args.dataset_name)
    user = path.expandvars("$USER")
    save_file = "nice_%s%s%s" % (args.dataset_name,
                                 "_transposed" if args.transposed else "",
                                 "_logistic" if args.logistic else "")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, save_file))
    variance_map_file = path.join(data_path, "variance_map.npy")
    if not path.isfile(variance_map_file):
        raise ValueError("Variance map file %s not found."
                         % variance_map_file)
    train(yaml_file, save_path, input_dim,
          args.transposed, args.logistic, variance_map_file)
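
serial.preprocess here is pylearn2's path preprocessor: it expands ${VAR} environment-variable markers inside a path string. A minimal sketch of the pattern, assuming pylearn2 is installed and PYLEARN2_NI_PATH is set:

# Minimal sketch: serial.preprocess expands ${VAR} markers in a path,
# which is why the examples can refer to data roots symbolically.
from pylearn2.utils import serial

data_path = serial.preprocess("${PYLEARN2_NI_PATH}/smri")
print(data_path)  # e.g. /data/ni/smri, depending on your environment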
Example #2
def test_rbm():
    save_path = path.join(serial.preprocess("${PYLEARN2_OUTS}"), "tutorials")
    if not path.isdir(serial.preprocess("${PYLEARN2_OUTS}")):
        raise IOError("PYLEARN2_OUTS directory not found; is the environment "
                      "variable set?")
    train_rbm.train_rbm(epochs=1, save_path=save_path)
    mri_analysis.main(path.join(save_path, "rbm_smri.pkl"),
                      save_path, "sz_t")
Example #3
def main(args):
    dataset_name = args.dataset_name

    logger.info("Getting dataset info for %s" % dataset_name)
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)
    mask_file = path.join(data_path, "mask.npy")
    mask = np.load(mask_file)
    input_dim = (mask == 1).sum()

    user = path.expandvars("$USER")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, "rbm_simple_test"))

    # File parameters are path specific ones (not model specific).
    file_params = {"save_path": save_path,
                   }

    # NOTE: yaml_file is assumed to be defined at module level in the
    # original script.
    yaml_template = open(yaml_file).read()
    hyperparams = expand(flatten(experiment.default_hyperparams(input_dim=input_dim)),
                         dict_type=ydict)

    # Set additional hyperparams from command line args
    if args.learning_rate is not None:
        hyperparams["learning_rate"] = args.learning_rate
    if args.batch_size is not None:
        hyperparams["batch_size"] = args.batch_size

    for param in file_params:
        yaml_template = yaml_template.replace("%%(%s)s" % param, file_params[param])

    yaml = yaml_template % hyperparams

    logger.info("Training")
    train = yaml_parse.load(yaml)
    train.main_loop()
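
The two-stage templating above first splices file parameters into the YAML text verbatim, then fills the remaining %(...)s placeholders from the hyperparameter dict. A toy, pylearn2-free illustration with a hypothetical template string:

# Toy illustration of the two-stage YAML templating (hypothetical template).
template = "save_path: %(save_path)s\nlearning_rate: %(learning_rate)s"
file_params = {"save_path": "/tmp/out"}
for param in file_params:
    # "%%(%s)s" % param builds the literal placeholder "%(save_path)s".
    template = template.replace("%%(%s)s" % param, file_params[param])
hyperparams = {"learning_rate": 0.0001}
print(template % hyperparams)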
Example #4
def train_nice(args):
    vn = True
    center = True
    logger.info("Getting dataset info for %s" % args.dataset_name)
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)
    if args.transposed:
        logger.info("Data in transpose...")
        mri = MRI.MRI_Transposed(dataset_name=args.dataset_name,
                                 unit_normalize=True,
                                 even_input=True,
                                 apply_mask=True)
        input_dim = mri.X.shape[1]
        variance_map_file = path.join(data_path, "transposed_variance_map.npy")
    else:
        mask_file = path.join(data_path, "mask.npy")
        mask = np.load(mask_file)
        input_dim = (mask == 1).sum()
        if input_dim % 2 == 1:
            input_dim -= 1
        mri = MRI.MRI_Standard(which_set="full",
                               dataset_name=args.dataset_name,
                               unit_normalize=True,
                               even_input=True,
                               apply_mask=True)
        variance_map_file = path.join(data_path, "variance_map.npy")
    save_variance_map(mri, variance_map_file)

    logger.info("Input shape: %d" % input_dim)

    p = path.abspath(path.dirname(__file__))
    yaml_file = path.join(p, "nice_mri.yaml")
    user = path.expandvars("$USER")

    if args.out_name is not None:
        out_name = args.out_name
    else:
        out_name = args.dataset_name
    save_file = "nice_%s%s%s" % (out_name,
                                 "_transposed" if args.transposed else "",
                                 "_logistic" if args.logistic else "")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, save_file))
    if path.isfile(save_path + ".pkl") or path.isfile(save_path + "_best.pkl"):
        answer = None
        while answer not in ["Y", "N", "y", "n"]:
            answer = raw_input("%s already exists, continuing will overwrite."
                               "\nOverwrite? (Y/N)[N]: " % save_path) or "N"
            if answer not in ["Y", "N", "y", "n"]:
                print "Please answer Y or N"
        if answer in ["N", "n"]:
            print "If you want to run without overwrite, consider using the -o option."
            sys.exit()

    logger.info("Saving to prefix %s" % save_path)

    if not path.isfile(variance_map_file):
        raise ValueError("Variance map file %s not found."
                         % variance_map_file)
    train(yaml_file, save_path, input_dim,
          args.transposed, args.logistic, variance_map_file, args.dataset_name)
Example #5
def test_rbm():
    save_path = path.join(serial.preprocess("${PYLEARN2_OUTS}"), "tutorials")
    if not path.isdir(serial.preprocess("${PYLEARN2_OUTS}")):
        raise IOError("PYLEARN2_OUTS directory not found; is the environment "
                      "variable set?")

    train_rbm.train_rbm(epochs=1, save_path=save_path)
    show_weights.show_weights(path.join(save_path, "rbm_mnist.pkl"),
                              out=path.join(save_path, "rbm_mnist_weights.png"))
Example #6
    def load_aod_gts(self):
        p = path.join(self.dataset_root, "aod_extra/")

        if not(path.isdir(serial.preprocess(p))):
            raise IOError("AOD extras directory %s not found."
                          % serial.preprocess(p))

        targets = np.load(serial.preprocess(p + "targets.npy"))
        novels = np.load(serial.preprocess(p + "novels.npy"))
        return targets, novels
    def __init__(self,
                 which_set,
                 data_path=None,
                 center=True,
                 rescale=True,
                 gcn=True):
        self.class_name = ['neg', 'pos']
        # load data
        path = "${PYLEARN2_DATA_PATH}/cin/"
        #datapath = path + 'feature850-2-1.pkl'
        if data_path is None:
            data_path = path + 'feature850-2-1.pkl'
        else:
            data_path = path + data_path
        data_path = serial.preprocess(data_path)
        with open(data_path, 'rb') as f:
            train_set, valid_set, test_set = cPickle.load(f)

        self.train_set = train_set
        self.valid_set = valid_set
        self.test_set = test_set
        if which_set == 'train':
            X, Y = self.train_set
        elif which_set == 'valid':
            X, Y = self.valid_set
        else:
            X, Y = self.test_set

        X = X.astype(float)
        axis = 0
        _max = np.max(X, axis=axis)
        _min = np.min(X, axis=axis)
        _mean = np.mean(X, axis=axis)
        _std = np.std(X, axis=axis)
        _scale = _max - _min


        # print _max
        # print _min
        # print _mean
        # print _std

        if gcn:
            X = global_contrast_normalize(X, scale=gcn)
        else:
            if center:
                X -= _mean
            if rescale:
                X /= _scale

        # topo_view = X.reshape(X.shape[0], X.shape[1], 1, 1)
        # y = np.reshape(Y, (Y.shape[0], 1))
        # y = np.atleast_2d(Y).T
        y = np.zeros((Y.shape[0], 2))
        y[:, 0] = Y
        y[:, 1] = 1 - Y
        print X.shape, y.shape
        super(CIN_FEATURE2, self).__init__(X=X, y=y)
Example #8
    def __init__(self, jobs, db, name, updater, analyzer, alerter, reload=False):
        self.__dict__.update(locals())

        # `args` refers to a module-level argparse namespace in the original
        # script.
        self.table_dir = serial.preprocess(path.join(args.out_dir,
                                                     self.name))
        self.html = HTMLPage(self.name + " results")

        self.analyzer.start()
        self.updater.start()
def getFilename(i):
    # `path` here is a directory-prefix string defined in the enclosing scope.
    # Zero-pad the snapshot index to three digits, e.g. snapshot_007.hdf5.
    out = path + 'snapshot_%03d.hdf5' % i
    return serial.preprocess(out)
Example #10
def test_data():
    pylearn2_out_path = path.expandvars("$PYLEARN2_OUTS")
    assert pylearn2_out_path != "", ("PYLEARN2_OUTS environment variable is "
                                     "not set.")

    pylearn2_data_path = path.expandvars("$PYLEARN2_NI_PATH")
    assert pylearn2_data_path != "", ("PYLEARN2_NI_PATH environment"
                                      " variable is not set")

    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/smri/")
    extras_path = serial.preprocess("${PYLEARN2_NI_PATH}/mri_extra/")

    try:
        assert path.isdir(data_path), data_path
        assert path.isdir(extras_path), extras_path
    except AssertionError as e:
        raise IOError("File or directory not found (%s), did you set your "
                      "PYLEARN2_NI_PATH correctly? (%s)" % (e, data_path))
Example #11
    def __init__(self, which_set, start=None, stop=None, shuffle=False):
        if which_set not in ['train', 'valid']:
            if which_set == 'test':
                raise ValueError(
                    "Currently test datasets not supported")
            raise ValueError(
                'Unrecognized which_set value "%s". ' % (which_set,) +
                'Valid values are ["train","valid"].')

        p = "${PYLEARN2_NI_PATH}/snp/"
        if which_set == 'train':
            data_path = p + 'gen.chr1.npy'
            label_path = p + 'gen.chr1_labels.npy'
        else:
            # Note: the "valid" split reuses the held-out test files here.
            assert which_set == 'valid'
            data_path = p + 'test.npy'
            label_path = p + 'test_labels.npy'

        data_path = serial.preprocess(data_path)
        label_path = serial.preprocess(label_path)

        print "Loading data"
        topo_view = np.load(data_path)
        y = np.atleast_2d(np.load(label_path)).T
        samples, number_snps = topo_view.shape

        if start is not None:
            stop = stop if (stop <= samples) else samples
            assert 0 <= start < stop
            topo_view = topo_view[start:stop, :]
            y = y[start:stop]

        if shuffle:
            self.shuffle_rng = make_np_rng(None, default_seed=[1, 2, 3], which_method="shuffle")
            # Shuffle over the current (possibly sliced) length, not the
            # original sample count.
            n = topo_view.shape[0]
            for i in xrange(n):
                j = self.shuffle_rng.randint(n)
                tmp = topo_view[i].copy()
                topo_view[i] = topo_view[j]
                topo_view[j] = tmp
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(SNP, self).__init__(X=topo_view, y=y, y_labels=np.amax(y)+1)
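
The element-swap loop above can also be written as a single index permutation, which keeps X and y aligned by construction. A numpy-only sketch:

# numpy-only sketch: one permutation shuffles rows of X and y together.
import numpy as np

rng = np.random.RandomState([1, 2, 3])
X = np.arange(12).reshape(6, 2)
y = np.arange(6).reshape(6, 1)
perm = rng.permutation(X.shape[0])
X, y = X[perm], y[perm]  # rows of X stay aligned with their labels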
Example #12
    def __init__(self, which_set, one_hot=False, axes=['b', 0, 1, 'c']):
        """
        .. todo::

            WRITEME
        """
        self.args = locals()

        assert which_set in self.data_split.keys()

        path = serial.preprocess(
            "${PYLEARN2_DATA_PATH}/ocr_letters/letter.data")
        with open(path, 'r') as data_f:
            data = data_f.readlines()
            data = [line.split("\t") for line in data]

        data_x = [map(int, item[6:-1]) for item in data]
        data_letters = [item[1] for item in data]
        data_fold = [int(item[5]) for item in data]

        letters = list(numpy.unique(data_letters))
        data_y = [letters.index(item) for item in data_letters]

        if which_set == 'train':
            split = slice(0, self.data_split['train'])
        elif which_set == 'valid':
            split = slice(self.data_split['train'], self.data_split['train'] +
                          self.data_split['valid'])
        elif which_set == 'test':
            split = slice(self.data_split['train'] + self.data_split['valid'],
                          (self.data_split['train'] +
                           self.data_split['valid'] +
                           self.data_split['test']))

        data_x = numpy.asarray(data_x[split])
        data_y = numpy.asarray(data_y[split])
        data_fold = numpy.asarray(data_fold[split])
        assert data_x.shape[0] == data_y.shape[0]
        assert data_x.shape[0] == self.data_split[which_set]

        self.one_hot = one_hot
        if one_hot:
            one_hot = numpy.zeros(
                (data_y.shape[0], len(letters)), dtype='float32')
            for i in xrange(data_y.shape[0]):
                one_hot[i, data_y[i]] = 1.
            data_y = one_hot

        view_converter = dense_design_matrix.DefaultViewConverter(
            (16, 8, 1), axes)
        super(OCR, self).__init__(
            X=data_x, y=data_y, view_converter=view_converter)

        assert not contains_nan(self.X)
        self.fold = data_fold
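
The per-row one-hot loop above has a vectorized equivalent; a short numpy sketch:

# numpy sketch: vectorized one-hot encoding, equivalent to the loop above.
import numpy as np

data_y = np.array([0, 2, 1, 2])
one_hot = np.zeros((data_y.shape[0], 3), dtype='float32')
one_hot[np.arange(data_y.shape[0]), data_y] = 1.
print(one_hot)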
Example #13
def main(dataset_name="smri"):
    logger.info("Getting dataset info for %s" % args.dataset_name)
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)
    mask_file = path.join(data_path, "mask.npy")
    mask = np.load(mask_file)
    input_dim = (mask == 1).sum()
    if input_dim % 2 == 1:
        input_dim -= 1
    mri = MRI.MRI_Standard(which_set="full",
                           dataset_name=dataset_name,
                           unit_normalize=True,
                           even_input=True,
                           apply_mask=True)
    variance_map_file = path.join(data_path, "variance_map.npy")
    mri_nifti.save_variance_map(mri, variance_map_file)

    user = path.expandvars("$USER")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, "jobman_test"))

    file_params = {"save_path": save_path,
                   "variance_map_file": variance_map_file
                   }

    # NOTE: yaml_file and the argparse `args` used below are assumed to be
    # defined at module level in the original script.
    yaml_template = open(yaml_file).read()
    hyperparams = expand(flatten(mlp_experiment.default_hyperparams(input_dim=input_dim)),
                         dict_type=ydict)

    for param in hyperparams:
        if hasattr(args, param) and getattr(args, param):
            val = getattr(args, param)
            logger.info("Filling %s with %r" % (param, val))
            hyperparams[param] = type(hyperparams[param])(val)

    for param in file_params:
        yaml_template = yaml_template.replace("%%(%s)s" % param, file_params[param])

    yaml = yaml_template % hyperparams
    print yaml
    logger.info("Training")
    train = yaml_parse.load(yaml)
    train.main_loop()
    def get_input_params(self, args, hyperparams):
        data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)

        data_class = hyperparams["data_class"]
        variance_normalize = hyperparams.get("variance_normalize", False)
        unit_normalize = hyperparams.get("unit_normalize", False)
        demean = hyperparams.get("demean", False)
        assert not (variance_normalize and unit_normalize)

        logger.info((data_class, variance_normalize, unit_normalize, demean))
        h = hash((data_class, variance_normalize, unit_normalize, demean))

        if self.d.get(h, False):
            return self.d[h]
        else:
            if data_class == "MRI_Transposed":
                assert not variance_normalize
                mri = MRI.MRI_Transposed(dataset_name=args.dataset_name,
                                         unit_normalize=unit_normalize,
                                         demean=demean,
                                         even_input=True,
                                         apply_mask=True)
                input_dim = mri.X.shape[1]
                variance_file_name = ("variance_map_transposed%s%s.npy"
                                      % ("_un" if unit_normalize else "",
                                         "_dm" if demean else ""))

            elif data_class == "MRI_Standard":
                assert not demean
                mask_file = path.join(data_path, "mask.npy")
                mask = np.load(mask_file)
                input_dim = (mask == 1).sum()
                if input_dim % 2 == 1:
                    input_dim -= 1
                mri = MRI.MRI_Standard(which_set="full",
                                       dataset_name=args.dataset_name,
                                       unit_normalize=unit_normalize,
                                       variance_normalize=variance_normalize,
                                       even_input=True,
                                       apply_mask=True)
                variance_file_name = ("variance_map%s%s.npy"
                                      % ("_un" if unit_normalize else "",
                                         "_vn" if variance_normalize else ""))
                logger.info(variance_file_name)
                logger.info((data_class, variance_normalize, unit_normalize, demean))
            else:
                raise ValueError("Unsupported data class %s" % data_class)

        variance_map_file = path.join(data_path, variance_file_name)
        if not path.isfile(variance_map_file):
            logger.info("Saving variance file %s" % variance_map_file)
            mri_nifti.save_variance_map(mri, variance_map_file)
        self.d[h] = (input_dim, variance_map_file)
        return self.d[h]
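
get_input_params caches its result in self.d, keyed by a hash of the parameter tuple, so repeated calls with the same dataset configuration skip the expensive MRI setup. A stripped-down sketch of that pattern with hypothetical names:

# Stripped-down sketch of the hash-keyed caching in get_input_params
# (hypothetical class; the real "_compute" is the MRI setup above).
class InputCache(object):
    def __init__(self):
        self.d = {}

    def get(self, *key):
        h = hash(key)
        if h not in self.d:
            self.d[h] = self._compute(*key)  # only on a cache miss
        return self.d[h]

    def _compute(self, *key):
        return sum(key)  # stand-in for the expensive work

cache = InputCache()
print(cache.get(1, 2, 3))  # computed
print(cache.get(1, 2, 3))  # served from self.d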
def train_nice():
    vn = True
    center = True
    smri = MRI.MRI_Transposed(dataset_name="smri",
                              even_input=True)
    input_dim = smri.X.shape[1]

    p = path.abspath(path.dirname(__file__))
    yaml_file = path.join(p, "nice_smri_transposed.yaml")
    user = path.expandvars("$USER")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/" % user)
    assert path.isdir(save_path)
    train(yaml_file, save_path, input_dim)
    def loadall(self):
        datasets = []
        for i in range(self.foldn):
            filename = self.filestr.format(str(i + 1))
            # `dirpath` is assumed to be defined on the module or instance in
            # the original code.
            filename = dirpath + filename
            filename = serial.preprocess(filename)
            print "load data file: " + filename
            self.loadi(i, filename=filename)

        dataset = datasets[0]
        X, y = datasetXy
        # print X.shape, y.shape

        return datasets
Example #17
 def __init__(self, which_set='train', center=False, start=None, stop=None,
              axes=['b', 'c', 0, 1], preprocessor=None,
              fit_preprocessor=False, fit_test_preprocessor=False):
     self.shape = (8, 35, 57)
     self.size = {'train': 2849, 'valid': 2849, 'test': 2849}
     self.range = (-10, 10)
     self.path = "${PYLEARN2_DATA_PATH}/ecmwf/"
     self.set_path = {'train': 'ecmwf.train', 'valid': 'ecmwf.val', 'test': 'ecmwf.test'}
     self.args = locals()
     if which_set not in ['train', 'valid', 'test']:
         raise ValueError(
             'Unrecognized which_set value "%s". ' % (which_set,) +
             'Valid values are ["train","valid","test"].')
     path = self.path + self.set_path[which_set]
     if control.get_load_data():
         path = serial.preprocess(path)
         datasetCache = cache.datasetCache
         path = datasetCache.cache_file(path)
         X, topo_view, y = self._read_ecmwf(path, which_set)
     else:
         X = np.random.rand(self.size[which_set], np.prod(self.shape))
         # Random stand-in data must be 4D to match the unpacking below.
         topo_view = np.random.rand(self.size[which_set], *self.shape)
         y = np.random.randint(self.range[0], self.range[1], (self.size[which_set], 1))
     (m, v, r, c) = topo_view.shape
     if center:
         topo_view -= topo_view.mean(axis=0)
     super(ECMWF, self).__init__(X=X, topo_view=topo_view, y=y, axes=axes)
     assert not np.any(np.isnan(self.X))
     if start is not None:
         assert start >= 0
         if stop > self.X.shape[0]:
             raise ValueError('stop=' + str(stop) + '>' +
                              'm=' + str(self.X.shape[0]))
         assert stop > start
         self.X = self.X[start:stop, :]
         if self.X.shape[0] != stop - start:
             raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                              % (self.X.shape[0], start, stop))
         if len(self.y.shape) > 1:
             self.y = self.y[start:stop, :]
         else:
             self.y = self.y[start:stop]
         assert self.y.shape[0] == stop - start
     if which_set == 'test':
         assert fit_test_preprocessor is None or \
             (fit_preprocessor == fit_test_preprocessor)
     if self.X is not None and preprocessor:
         preprocessor.apply(self, fit_preprocessor)
Example #18
def run_experiment(experiment, **kwargs):
    """
    Experiment function.
    Used by jobman to run jobs. Must be loaded externally.
    TODO: add sigint handling.

    Parameters
    ----------
    experiment: module
        Experiment module.
    kwargs: dict
        Typically hyperparameters.
    """

    hyper_parameters = experiment.default_hyperparams()
    set_hyper_parameters(hyper_parameters, **kwargs)
    file_parameters = experiment.fileparams
    set_hyper_parameters(file_parameters, **kwargs)
    hyper_parameters.update(file_parameters)

    ih = MRIInputHandler()
    input_dim, variance_map_file = ih.get_input_params(hyper_parameters)
    hyper_parameters["nvis"] = input_dim
    hyper_parameters["variance_map_file"] = variance_map_file

    pid = os.getpid()
    out_path = serial.preprocess(
        hyper_parameters.get("out_path", "${PYLEARN2_OUTS}"))
    if not path.isdir(out_path):
        os.mkdir(out_path)
    if not path.isdir(path.join(out_path, "logs")):
        os.mkdir(path.join(out_path, "logs"))

    hyper_parameters = expand(flatten(hyper_parameters), dict_type=ydict)

    lh = LogHandler(experiment, hyper_parameters, out_path, pid)
    h = logging.StreamHandler(lh)
    monitor.log.addHandler(h)

    yaml_template = open(experiment.yaml_file).read()
    yaml = yaml_template % hyper_parameters
    train_object = yaml_parse.load(yaml)
    try:
        train_object.main_loop()
        lh.finish("COMPLETED")
    except KeyboardInterrupt:
        print("Quitting...")
        lh.finish("KILLED")
    def loadall(self,
                dirpath="${PYLEARN2_DATA_PATH}/cin/",
                filestr="feature2086-2-{}.pkl",
                n=10):
        datasets = []
        for i in range(n):
            filename = filestr.format(str(i + 1))
            filename = dirpath + filename
            filename = serial.preprocess(filename)
            print "load data file: " + filename
            self.loadi(i, filename=filename)

        dataset = datasets[0]
        X, y = datasetXy
        print X.shape, y.shape

        return datasets
Example #20
    def get_mask(self):
        """
        Get mask for dataset.

        Parameters
        ----------

        Returns
        -------
        mask: array-like
            4D array of 1 and 0 values.
        """
        p = path.join(self.dataset_root, self.dataset_name + "/")
        mask_path = serial.preprocess(p + "mask.npy")
        mask = np.load(mask_path)
        if not np.all(np.bitwise_or(mask == 0, mask == 1)):
            raise ValueError("Mask has incorrect values.")
        return mask
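
A standalone check mirroring get_mask's validation that a mask is strictly binary:

# Standalone sketch of the binary-mask validation used by get_mask.
import numpy as np

mask = np.array([[0, 1], [1, 1]])
if not np.all(np.bitwise_or(mask == 0, mask == 1)):
    raise ValueError("Mask has incorrect values.")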
Example #21
def test_data():
    pylearn2_data_path = path.expandvars("$PYLEARN2_DATA_PATH")
    assert pylearn2_data_path != "", ("PYLEARN2_DATA_PATH environment"
                                      " variable is not set")

    data_path = serial.preprocess("${PYLEARN2_DATA_PATH}/mnist/")
    try:
        assert path.isdir(data_path), data_path
        assert path.isfile(path.join(data_path, "t10k-images-idx3-ubyte")),\
            "t10k-images-idx3-ubyte"
        assert path.isfile(path.join(data_path, "t10k-labels-idx1-ubyte")),\
            "t10k-labels-idx1-ubyte"
        assert path.isfile(path.join(data_path, "train-images-idx3-ubyte")),\
            "train-images-idx3-ubyte"
        assert path.isfile(path.join(data_path, "train-labels-idx1-ubyte")),\
            "train-labels-idx1-ubyte"
    except AssertionError as e:
        raise IOError("File or directory not found (%s), did you set "
                      "PYLEARN2_DATA_PATH correctly? (%s)" % (e, data_path))
Example #22
    def _load_path(self, which_set, which_targets, word2vec_dict={}):
        if which_targets not in ['fine', 'coarse']:
            raise ValueError(
                'Unrecognized which_targets value "%s". ' % (which_targets,) +
                'Valid values are ["fine","coarse"].')

        if which_set not in ['train', 'test']:
            raise ValueError(
                'Unrecognized which_set value "%s". ' % (which_set,) +
                'Valid values are ["train","test"].')

        if control.get_load_data():
            path = "${PYLEARN2_DATA_PATH}/TREC_question_type_data/"
            if which_set == 'train':
                data_path = path + 'trecqc.train_5500.label.txt'
            else:
                assert which_set == 'test'
                data_path = path + 'trecqc.test_500.label.txt'
            data_path = serial.preprocess(data_path)
        self.path = path
        return data_path
Example #23
def train(yaml_file, save_path, epochs):
    yaml = open(yaml_file, "r").read()

    # Gets the input dimensionality from the mri mask
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/smri")
    logger.info("Loading data from %s" % data_path)
    mask_file = path.join(data_path, "mask.npy")
    mask = np.load(mask_file)
    input_dim = len(np.where(mask.flatten() == 1)[0].tolist())
    del mask

    # Fills in the blanks of the yaml file
    hyperparams = {"nvis": input_dim,
                   "batch_size": 5,
                   "detector_layer_dim": 64,
                   "monitoring_batches": 5,
                   "save_path": save_path,
                   "max_epochs": epochs
                  }
    yaml = yaml % hyperparams
    train_yaml(yaml)
def test_multimodal_dbn():

    skip.skip_if_no_data()

    yaml_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                  '..'))
    save_path = serial.preprocess("${PYLEARN2_DATA_PATH}/experiment/multimodal_dbn/")

    train_image_layer1(yaml_file_path, save_path)
    train_image_layer2(yaml_file_path, save_path)
    train_text_layer1(yaml_file_path, save_path)
    train_text_layer2(yaml_file_path, save_path)
    
    get_representations_for_joint_layer(yaml_file_path, save_path, 100)
    train_joint_hidden_layer(yaml_file_path, save_path)
    
    # train_mlp(yaml_file_path, save_path)

    try:
        os.remove(save_path + "image_rbm1.pkl")
        os.remove(save_path + "image_rbm2.pkl")
    except OSError:
        pass
Example #25
    def __init__(self, name, which_set, image_format='png',
                image_converter='RGB'):

        if which_set not in ['train', 'test', 'valid']:
            raise ValueError(
                'Unrecognized which_set value "%s". ' % (which_set,) +
                'Valid values are ["train","test","valid"].')

        data_path = serial.preprocess('${PYLEARN2_DATA_PATH}')
        image_path = os.path.join(data_path, name, which_set)

        classes = {}
        with open(os.path.join(data_path, name, 'classes.csv'), 'r') as f:
            reader = csv.DictReader(f)
            for row in reader:
                classes[row['image']] = int(row['class'])
        nb_classes = len(set(classes.values()))

        imgs = [img for img in os.listdir(image_path)
                    if img.endswith(image_format)]

        img = np.array(Image.open(os.path.join(image_path,
                        imgs[0])).convert(image_converter))
        data = np.zeros(shape=(len(imgs),
            img.shape[0],
            img.shape[1],
            img.shape[2] if len(img.shape) == 3 else 1))
        y = np.zeros(shape=(len(imgs), nb_classes))
        for i in range(0, len(imgs)):
            img = np.array(Image.open(os.path.join(image_path, imgs[i]))
                    .convert(image_converter))
            data[i] = img.reshape(img.shape[0],
                                    img.shape[1],
                                    img.shape[2] if len(img.shape) == 3 else 1)

            y[i][classes[imgs[i]]] = 1
        super(ImageDataset, self).__init__(topo_view=data, y=y)
Example #26
    def resolve_dataset(self, which_set, dataset_name):
        """
        Resolve the dataset from the file directories.

        Parameters
        ----------
        which_set: str
            train, test, or full.
        dataset_name: str
            Name of the dataset sub-directory under the dataset root.
        """
        p = path.join(self.dataset_root, dataset_name + "/")

        if not(path.isdir(serial.preprocess(p))):
            raise IOError("MRI dataset directory %s not found."
                           % serial.preprocess(p))

        if which_set == "train":
            data_path = p + "train.npy"
            label_path = p + "train_labels.npy"
        elif which_set == "test":
            data_path = p + "test.npy"
            label_path = p + "test_labels.npy"
        else:
            if which_set != "full":
                raise ValueError("dataset \"%s\" not supported." % which_set)
            data_path = p + "full_unshuffled.npy"
            label_path = p + "full_labels_unshuffled.npy"
        nifti_path = p + "base.nii"

        data_path = serial.preprocess(data_path)
        label_path = serial.preprocess(label_path)
        try:
            self.base_nifti = load_image(serial.preprocess(nifti_path))
            logger.info("Loaded nifti")
        except IOError:
            self.base_nifti = None
            logger.warn("`base.nii` not in dataset directory. "
                        "You may need to reprocess.")

        if not(path.isfile(data_path)):
            raise ValueError("Dataset %s not found in %s"
                             % (which_set, serial.preprocess(p)))
        return data_path, label_path
Example #27
    def make_data(self, which_set, p, center=False, variance_normalize=False,
                  shuffle=False, save_dummy=False):
        """
        Function to make h5 file.
        Note: parameters the same as __init__ function.
        """

        logger.info("Making h5 file for %s" % which_set)

        if which_set == "train":
            source_path = serial.preprocess(p + "train.npy")
            data_path = serial.preprocess(p + "train.h5")
            label_path = serial.preprocess(p + "train_labels.npy")
        else:
            assert which_set == "test"
            source_path = serial.preprocess(p + "test.npy")
            data_path = serial.preprocess(p + "test.h5")
            label_path = serial.preprocess(p + "test_labels.npy")

        data_path = "".join(data_path.split(".")[0] + "_dummy.h5")

        # Get the topological view and labels.
        topo_view = np.load(source_path)
        y = np.load(label_path)
        num_labels = np.amax(y) + 1

        # Shape information and mask.
        samples, rows, columns, depth = topo_view.shape
        if self.mask is not None:
            assert self.mask.shape == (rows, columns, depth)
            size = len(np.where(self.mask.flatten() == 1)[0].tolist())
        else:
            size = rows * columns * depth

        self.view_converter = MRIViewConverter((rows, columns, depth))
        X = self.view_converter.topo_view_to_design_mat(topo_view, self.mask)

        # TODO(dhjelm): one_hot is going away.
        one_hot = np.zeros((samples, num_labels), dtype=config.floatX)
        for i in xrange(y.shape[0]):
            one_hot[i, y[i] - 1] = 1.

        if center:
            X -= X.mean(axis=0)

        if variance_normalize:
            X /= X.std(axis=0)

        rng = make_np_rng(None, 322, which_method="shuffle")
        if shuffle:
            index = range(samples)
            rng.shuffle(index)
            X = X[index, :]
            one_hot = one_hot[index, :]

        assert not np.any(np.isnan(X))

        h5file, node = self.init_hdf5(data_path,
                                      ([samples, size], [samples, num_labels]))
        MRI_Big.fill_hdf5(h5file, X, one_hot, node)
        h5file.close()
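
The view converter above flattens a masked 4D volume into a (samples, voxels) design matrix. A numpy-only sketch of that masking step (not the MRIViewConverter API):

# numpy-only sketch of masked flattening: keep only in-mask voxels per sample.
import numpy as np

topo_view = np.random.rand(5, 4, 4, 3)  # (samples, rows, columns, depth)
mask = np.random.rand(4, 4, 3) > 0.5    # binary volume mask
X = topo_view[:, mask]                  # shape: (samples, mask.sum())
print(X.shape)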
Example #28
    def __init__(self, which_set, center=False, variance_normalize=False,
                 shuffle=False, apply_mask=False, preprocessor=None, dataset_name="smri",
                 reprocess=False, save_dummy=False):
        """
        Parameters
        ----------
        which_set: string
            "train" or "test"
        center: bool
            If True, then data -> data - data.mean()
        variance_normalize: bool
            If True, then data -> data / data.std()
        shuffle: bool
            If True, then shuffle data when writing h5 (does nothing if not processing an h5).
        apply_mask: bool:
            If True, then the h5 file is masked with a mask file found in the data directory.
        preprocessor: not supported yet, TODO.
        dataset_name: string
            Dataset sub-directory name from ${PYLEARN2_NI_PATH}
        reprocess: bool
            Some might want to reprocess the h5 file.
        save_dummy: bool
            Use a dummy file. This is for tests.
        """

        logger.warn("This class is deprecated and needs to be refactored.")
        if not path.isdir(serial.preprocess("${PYLEARN2_NI_PATH}")):
            raise ValueError("Did you set the PYLEARN_NI_PATH variable?")

        if which_set not in ["train", "test"]:
            if which_set == "valid":
                raise ValueError(
                    "Currently validation dataset is not supported with"
                    "sMRI.  This can be added in smri_nifti.py.")
            raise ValueError(
                "Unrecognized which_set value %s " % (which_set))

        self.__dict__.update(locals())
        del self.self

        p = "${PYLEARN2_NI_PATH}/%s/" % dataset_name
        assert path.isdir(serial.preprocess(p)), (
            "No NI data directory called %s" % dataset_name)

        if which_set == "train":
            data_path = p + "train.h5"
        else:
            assert which_set == "test"
            data_path = p + "test.h5"

        # The dummy file is for tests; we don't want to save over data we
        # might actually be using every time we run a test.
        if save_dummy:
            data_path = data_path.split(".")[0] + "_dummy.h5"

        data_path = serial.preprocess(data_path)

        # Load the mask file and retrieve shape information.
        self.mask = None
        mask_path = serial.preprocess(p + "mask.npy")
        if not path.isfile(mask_path):
            raise IOError("No mask found in %s."
                          "This file is needed to retrieve shape information."
                          "Are you sure this is a MRI dataset?" % mask_path)
        mask = np.load(mask_path)
        rows, columns, depth = mask.shape
        if apply_mask:
            self.mask = mask

        # Make the h5 file if not present or if reprocess flag is set.
        if not os.path.isfile(data_path) or reprocess:
            self.filters = tables.Filters(complib="blosc", complevel=5)
            self.make_data(which_set, serial.preprocess(p),
                           center=center,
                           variance_normalize=variance_normalize,
                           shuffle=shuffle, save_dummy=save_dummy)

        self.h5file = tables.openFile(data_path)
        data = self.h5file.getNode("/", "Data")
        view_converter = MRIViewConverter((rows, columns, depth))

        super(MRI_Big, self).__init__(X=data.X, y=data.y,
                                      view_converter=view_converter)

        self.h5file.flush()
Example #29
    def __init__(self,
                 which_set,
                 center=False,
                 shuffle=False,
                 one_hot=False,
                 binarize=False,
                 start=None,
                 stop=None,
                 axes=['b', 0, 1, 'c'],
                 preprocessor=None,
                 fit_preprocessor=False,
                 fit_test_preprocessor=False):

        self.args = locals()

        if which_set not in ['train', 'test']:
            if which_set == 'valid':
                raise ValueError(
                    "There is no such thing as the MNIST "
                    "validation set. MNIST consists of 60,000 train examples and 10,000 test"
                    " examples. If you wish to use a validation set you should divide the train "
                    "set yourself. The pylearn2 dataset implements and will only ever implement "
                    "the standard train / test split used in the literature.")
            raise ValueError('Unrecognized which_set value "%s". ' %
                             (which_set, ) +
                             'Valid values are ["train","test"].')

        def dimshuffle(b01c):
            default = ('b', 0, 1, 'c')
            return b01c.transpose(*[default.index(axis) for axis in axes])

        if control.get_load_data():
            path = "${PYLEARN2_DATA_PATH}/mnist/"
            if which_set == 'train':
                im_path = path + 'train-images-idx3-ubyte'
                label_path = path + 'train-labels-idx1-ubyte'
            else:
                assert which_set == 'test'
                im_path = path + 't10k-images-idx3-ubyte'
                label_path = path + 't10k-labels-idx1-ubyte'
            # Path substitution done here in order to make the lower-level
            # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
            # the Deep Learning Tutorials, or in another package).
            im_path = serial.preprocess(im_path)
            label_path = serial.preprocess(label_path)
            topo_view = read_mnist_images(im_path, dtype='float32')
            y = read_mnist_labels(label_path)

            if binarize:
                topo_view = (topo_view > 0.5).astype('float32')

            self.one_hot = one_hot
            if one_hot:
                one_hot = N.zeros((y.shape[0], 10), dtype='float32')
                for i in xrange(y.shape[0]):
                    one_hot[i, y[i]] = 1.
                y = one_hot

            m, r, c = topo_view.shape
            assert r == 28
            assert c == 28
            topo_view = topo_view.reshape(m, r, c, 1)

            if which_set == 'train':
                assert m == 60000
            elif which_set == 'test':
                assert m == 10000
            else:
                assert False

            if center:
                topo_view -= topo_view.mean(axis=0)

            if shuffle:
                self.shuffle_rng = np.random.RandomState([1, 2, 3])
                for i in xrange(topo_view.shape[0]):
                    j = self.shuffle_rng.randint(m)
                    # Copy ensures that memory is not aliased.
                    tmp = topo_view[i, :, :, :].copy()
                    topo_view[i, :, :, :] = topo_view[j, :, :, :]
                    topo_view[j, :, :, :] = tmp
                    # Note: slicing with i:i+1 works for both one_hot=True/False.
                    tmp = y[i:i + 1].copy()
                    y[i] = y[j]
                    y[j] = tmp

            super(MNIST, self).__init__(topo_view=dimshuffle(topo_view),
                                        y=y,
                                        axes=axes)

            assert not N.any(N.isnan(self.X))

            if start is not None:
                assert start >= 0
                if stop > self.X.shape[0]:
                    raise ValueError('stop=' + str(stop) + '>' + 'm=' +
                                     str(self.X.shape[0]))
                assert stop > start
                self.X = self.X[start:stop, :]
                if self.X.shape[0] != stop - start:
                    raise ValueError("X.shape[0]: %d. start: %d stop: %d" %
                                     (self.X.shape[0], start, stop))
                if len(self.y.shape) > 1:
                    self.y = self.y[start:stop, :]
                else:
                    self.y = self.y[start:stop]
                assert self.y.shape[0] == stop - start
        else:
            # data loading is disabled, just make something that defines the
            # right topology
            topo = dimshuffle(np.zeros((1, 28, 28, 1)))
            super(MNIST, self).__init__(topo_view=topo, axes=axes)
            self.X = None

        if which_set == 'test':
            assert fit_test_preprocessor is None or (fit_preprocessor
                                                     == fit_test_preprocessor)

        if self.X is not None and preprocessor:
            preprocessor.apply(self, fit_preprocessor)
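
The dimshuffle helper in these MNIST examples maps the default ('b', 0, 1, 'c') layout onto whatever axes ordering the caller requests. A toy check:

# Toy check of the dimshuffle helper: reorder ('b', 0, 1, 'c') data into a
# requested axes layout.
import numpy as np

axes = ['b', 'c', 0, 1]
default = ('b', 0, 1, 'c')
b01c = np.zeros((5, 28, 28, 1))
bc01 = b01c.transpose(*[default.index(axis) for axis in axes])
print(bc01.shape)  # (5, 1, 28, 28)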
Example #30
    def __init__(self,
                 which_set,
                 center=False,
                 shuffle=False,
                 one_hot=None,
                 binarize=False,
                 start=None,
                 stop=None,
                 axes=['b', 0, 1, 'c'],
                 preprocessor=None,
                 fit_preprocessor=False,
                 fit_test_preprocessor=False):

        self.args = locals()

        if which_set not in ['train', 'test']:
            if which_set == 'valid':
                raise ValueError(
                    "There is no such thing as the MNIST validation set. "
                    "MNIST consists of 60,000 train examples and 10,000 test "
                    "examples. If you wish to use a validation set you "
                    "should divide the train set yourself. The pylearn2 "
                    "dataset implements and will only ever implement the "
                    "standard train / test split used in the literature.")
            raise ValueError('Unrecognized which_set value "%s". ' %
                             (which_set, ) +
                             'Valid values are ["train","test"].')

        def dimshuffle(b01c):
            default = ('b', 0, 1, 'c')
            return b01c.transpose(*[default.index(axis) for axis in axes])

        if control.get_load_data():
            path = "${PYLEARN2_DATA_PATH}/mnist/"
            if which_set == 'train':
                im_path = path + 'train-images-idx3-ubyte'
                label_path = path + 'train-labels-idx1-ubyte'
            else:
                assert which_set == 'test'
                im_path = path + 't10k-images-idx3-ubyte'
                label_path = path + 't10k-labels-idx1-ubyte'
            # Path substitution done here in order to make the lower-level
            # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
            # the Deep Learning Tutorials, or in another package).
            im_path = serial.preprocess(im_path)
            label_path = serial.preprocess(label_path)

            # Locally cache the files before reading them
            datasetCache = cache.datasetCache
            im_path = datasetCache.cache_file(im_path)
            label_path = datasetCache.cache_file(label_path)

            topo_view = read_mnist_images(im_path, dtype='float32')
            y = read_mnist_labels(label_path)

            if binarize:
                topo_view = (topo_view > 0.5).astype('float32')

            max_labels = 10
            if one_hot is not None:
                warnings.warn(
                    "the `one_hot` parameter is deprecated. To get "
                    "one-hot encoded targets, request that they "
                    "live in `VectorSpace` through the `data_specs` "
                    "parameter of MNIST's iterator method. "
                    "`one_hot` will be removed on or after "
                    "September 20, 2014.",
                    stacklevel=2)

            m, r, c = topo_view.shape
            assert r == 28
            assert c == 28
            topo_view = topo_view.reshape(m, r, c, 1)

            if which_set == 'train':
                assert m == 60000
            elif which_set == 'test':
                assert m == 10000
            else:
                assert False

            if center:
                topo_view -= topo_view.mean(axis=0)

            if shuffle:
                self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                               which_method="shuffle")
                for i in xrange(topo_view.shape[0]):
                    j = self.shuffle_rng.randint(m)
                    # Copy ensures that memory is not aliased.
                    tmp = topo_view[i, :, :, :].copy()
                    topo_view[i, :, :, :] = topo_view[j, :, :, :]
                    topo_view[j, :, :, :] = tmp
                    # Note: slicing with i:i+1 works for one_hot=True/False
                    tmp = y[i:i + 1].copy()
                    y[i] = y[j]
                    y[j] = tmp

            super(MNIST, self).__init__(topo_view=dimshuffle(topo_view),
                                        y=y,
                                        axes=axes,
                                        max_labels=max_labels)

            assert not N.any(N.isnan(self.X))

            if start is not None:
                assert start >= 0
                if stop > self.X.shape[0]:
                    raise ValueError('stop=' + str(stop) + '>' + 'm=' +
                                     str(self.X.shape[0]))
                assert stop > start
                self.X = self.X[start:stop, :]
                if self.X.shape[0] != stop - start:
                    raise ValueError("X.shape[0]: %d. start: %d stop: %d" %
                                     (self.X.shape[0], start, stop))
                if len(self.y.shape) > 1:
                    self.y = self.y[start:stop, :]
                else:
                    self.y = self.y[start:stop]
                assert self.y.shape[0] == stop - start
        else:
            # data loading is disabled, just make something that defines the
            # right topology
            topo = dimshuffle(np.zeros((1, 28, 28, 1)))
            super(MNIST, self).__init__(topo_view=topo, axes=axes)
            self.X = None

        if which_set == 'test':
            assert fit_test_preprocessor is None or \
                (fit_preprocessor == fit_test_preprocessor)

        if self.X is not None and preprocessor:
            preprocessor.apply(self, fit_preprocessor)
Example #31
def main(model, out_path=None, prefix=None, **anal_args):
    """
    Main function of module.
    This function controls the high end analysis functions.

    Parameters
    ----------
    model: Pylearn2.Model or str
        Model instance or path for the model.
    out_path: str, optional
        Path for the output directory.
    prefix: str, optional
        If provided, prefix for all output files.
    dataset_root: str, optional
        If provided, use as the root dir for dataset extraction.
    anal_args: dict
        argparse arguments (defined below).
    """

    if out_path is None and prefix is None and isinstance(model, str):
        prefix = ".".join(path.basename(model).split(".")[:-1])
        sm_prefix = prefix
        nifti_prefix = prefix
    else:
        nifti_prefix = "image"

    if out_path is None:
        assert isinstance(model, str), ("If you provide a model object, you "
                                        "must provide an out_path")
        out_path = path.abspath(path.dirname(model))

    if isinstance(model, str):
        logger.info("Loading model from %s" % model)
        model = serial.load(model)

    if not path.isdir(out_path):
        os.mkdir(out_path)

    logger.info("Getting features")
    feature_dict = fe.extract_features(model, **anal_args)
    dataset = feature_dict.pop("dataset")
    if isinstance(dataset, TransformerDataset):
        dataset = dataset.raw

    ms = fe.ModelStructure(model, dataset)
    data_path = serial.preprocess(dataset.dataset_root + dataset.dataset_name)
    sim_dict_file = path.join(data_path, "sim_dict.pkl")
    sim_dict = pickle.load(open(sim_dict_file, "r"))
    analyze_ground_truth(feature_dict, sim_dict, dataset)

    anal_dict = dict()

    mask = dataset.get_mask()
    feature_dict["mask"] = fe.Features(np.array([mask]), np.array([[0]]),
                                       name="mask")

    if isinstance(dataset, MRI.MRI_Transposed):
        samples = dataset.X[:, :20].T
    else:
        samples = dataset.X[:20]

    feature_dict["samples"] = fe.Features(samples, np.array([[0] * 20]).T,
                                          name="samples")

    if isinstance(dataset, MRI.MRI_Transposed):
        mean_image = dataset.X.mean(axis=1).T
    else:
        mean_image = dataset.X.mean(axis=0)

    feature_dict["mean_image"] = fe.Features(np.array([mean_image]),
                                             np.array([[0]]).T,
                                             name="mean image")

    if dataset.variance_map is not None:
        variance_map = dataset.variance_map[1]
        feature_dict["variance_map"] = fe.Features(np.array([variance_map]),
                                                   np.array([[0]]).T,
                                                   name="variance map")

    for name, features in feature_dict.iteritems():
        image_dir = path.join(out_path, "%s_images" % name)
        if not path.isdir(image_dir):
            os.mkdir(image_dir)
        save_simtb_spatial_maps(dataset, features, image_dir)

        features.set_histograms(tolist=True)
        fds = dict()
        for k, f in features.f.iteritems():
            fd = dict(
                image=path.join("%s_images" % name, "%d.png" % f.id),
                image_type="simtb",
                index=f.id,
                hists=f.hists,
                match_indices=f.match_indices
            )
            fd.update(**f.stats)

            fds[k] = fd

        anal_dict[name] = dict(
            name=name,
            image_dir=image_dir,
            features=fds
        )

    json_file = path.join(out_path, "analysis.json")
    with open(json_file, "w") as f:
        json.dump(anal_dict, f)

    logger.info("Done.")
Example #32
def train(yaml_file, save_path, epochs):
    yaml = open(yaml_file, "r").read()
    input_dim = 784 # MNIST input size

    # Fills in the blanks of the yaml file
    hyperparams = {"nvis": input_dim,
                    "batch_size": 50,
                    "detector_layer_dim": 200,
                    "monitoring_batches": 10,
                    "train_stop": 50000,
                    "max_epochs": epochs,
                    "save_path": save_path
                  }
    yaml = yaml % hyperparams
    train_yaml(yaml)

def train_rbm(epochs=300, save_path=None):
    # Load the yaml file
    yaml_file = path.join(path.abspath(path.dirname(__file__)), "rbm.yaml")
    if save_path is None:
        save_path = path.abspath(path.dirname(__file__))
    train(yaml_file, save_path, epochs)

if __name__ == "__main__":
    save_path = path.join(serial.preprocess("${PYLEARN2_OUTS}"), "tutorials")
    if not path.isdir(serial.preprocess("${PYLEARN2_OUTS}")):
        raise IOError("PYLEARN2_OUTS directory not found; is the environment "
                      "variable set?")
    if not path.isdir(save_path):
        os.mkdir(save_path)
    train_rbm(save_path=save_path)
Example #33
    def __init__(self,
                 which_set,
                 imgd=65,
                 zd=1,
                 ds=1,
                 center=False,
                 shuffle=False,
                 one_hot=False,
                 binarize=False,
                 start=None,
                 stop=None,
                 axes=['b', 0, 1, 'c'],
                 preprocessor=None,
                 fit_preprocessor=False,
                 fit_test_preprocessor=False):

        self.args = locals()

        if which_set not in ['train', 'valid', 'test']:
            raise ValueError('Unrecognized which_set value "%s". ' %
                             (which_set, ) +
                             'Valid values are ["train","valid","test"].')

        def dimshuffle(b01c):
            default = ('b', 0, 1, 'c')
            return b01c.transpose(*[default.index(axis) for axis in axes])

        if control.get_load_data():

            path = "${PYLEARN2_DATA_PATH}/lgn/"
            path = path + "LGN1_MembraneSamples_65x65x1_mp0.50_train50000_valid10000_test10000_seed11.pkl.gz"
            path = serial.preprocess(path)

            f = gzip.open(path, 'rb')
            train_set, valid_set, test_set = cPickle.load(f)
            f.close()

            if which_set == 'train':
                data = train_set
            elif which_set == 'valid':
                data = valid_set
            else:
                data = test_set

            input_shape = (imgd, imgd, zd)

            # f = h5py.file(path, 'r')
            # input_shape = f['input_shape'][...]

            # if which_set == 'train':
            #     data = f['/train_set'][...]
            # elif which_set == 'valid':
            #     data = f['/valid_set'][...]
            # else:
            #     data = f['/test_set'][...]

            # Convert images to float 0-1
            topo_view = data[0].astype(np.float32) / 255.0
            y = data[1]

            self.one_hot = one_hot
            if one_hot:
                one_hot = N.zeros((y.shape[0], 2), dtype='float32')
                for i in xrange(y.shape[0]):
                    one_hot[i, y[i]] = 1.
                y = one_hot

            m = topo_view.shape[0]
            rows, cols, slices = input_shape
            topo_view = topo_view.reshape(m, rows, cols, slices)

            if center:
                topo_view -= topo_view.mean(axis=0)

            if shuffle:
                self.shuffle_rng = np.random.RandomState([1, 2, 3])
                for i in xrange(topo_view.shape[0]):
                    j = self.shuffle_rng.randint(m)
                    # Copy ensures that memory is not aliased.
                    tmp = topo_view[i, :, :, :].copy()
                    topo_view[i, :, :, :] = topo_view[j, :, :, :]
                    topo_view[j, :, :, :] = tmp
                    # Note: slicing with i:i+1 works for both one_hot=True/False.
                    tmp = y[i:i + 1].copy()
                    y[i] = y[j]
                    y[j] = tmp

            super(LGN, self).__init__(topo_view=dimshuffle(topo_view),
                                      y=y,
                                      axes=axes)

            assert not N.any(N.isnan(self.X))

            if start is not None:
                assert start >= 0
                if stop > self.X.shape[0]:
                    raise ValueError('stop=' + str(stop) + '>' + 'm=' +
                                     str(self.X.shape[0]))
                assert stop > start
                self.X = self.X[start:stop, :]
                if self.X.shape[0] != stop - start:
                    raise ValueError("X.shape[0]: %d. start: %d stop: %d" %
                                     (self.X.shape[0], start, stop))
                if len(self.y.shape) > 1:
                    self.y = self.y[start:stop, :]
                else:
                    self.y = self.y[start:stop]
                assert self.y.shape[0] == stop - start
        else:
            #data loading is disabled, just make something that defines the right topology
            topo = dimshuffle(np.zeros((1, 65, 65, 1)))
            super(LGN, self).__init__(topo_view=topo, axes=axes)
            self.X = None

        if which_set == 'test':
            assert fit_test_preprocessor is None or (fit_preprocessor
                                                     == fit_test_preprocessor)

        if self.X is not None and preprocessor:
            preprocessor.apply(self, fit_preprocessor)
Example #34
    def __init__(self,
                 which_set,
                 center=False,
                 shuffle=False,
                 binarize=False,
                 start=None,
                 stop=None,
                 axes=['b', 0, 1, 'c'],
                 preprocessor=None,
                 fit_preprocessor=False,
                 fit_test_preprocessor=False):
        self.args = locals()

        if which_set not in ['train', 'test']:
            if which_set == 'valid':
                raise ValueError(
                    "There is no such thing as the MNIST validation set. "
                    "MNIST consists of 60,000 train examples and 10,000 test "
                    "examples. If you wish to use a validation set you "
                    "should divide the train set yourself. The pylearn2 "
                    "dataset implements and will only ever implement the "
                    "standard train / test split used in the literature.")
            raise ValueError('Unrecognized which_set value "%s". ' %
                             (which_set, ) +
                             'Valid values are ["train","test"].')

        def dimshuffle(b01c):
            """
            .. todo::

                WRITEME
            """
            default = ('b', 0, 1, 'c')
            return b01c.transpose(*[default.index(axis) for axis in axes])

        if control.get_load_data():
            path = "${PYLEARN2_DATA_PATH}/mnist/"
            if which_set == 'train':
                im_path = path + 'train-images-idx3-ubyte'
                label_path = path + 'train-labels-idx1-ubyte'
            else:
                assert which_set == 'test'
                im_path = path + 't10k-images-idx3-ubyte'
                label_path = path + 't10k-labels-idx1-ubyte'
            # Path substitution done here in order to make the lower-level
            # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
            # the Deep Learning Tutorials, or in another package).
            im_path = serial.preprocess(im_path)
            label_path = serial.preprocess(label_path)

            # Locally cache the files before reading them
            datasetCache = cache.datasetCache
            im_path = datasetCache.cache_file(im_path)
            label_path = datasetCache.cache_file(label_path)

            topo_view = read_mnist_images(im_path, dtype='float32')
            y = np.atleast_2d(read_mnist_labels(label_path)).T
        else:
            if which_set == 'train':
                size = 60000
            elif which_set == 'test':
                size = 10000
            else:
                raise ValueError('Unrecognized which_set value "%s". ' %
                                 (which_set, ) +
                                 'Valid values are ["train","test"].')
            topo_view = np.random.rand(size, 28, 28)
            y = np.random.randint(0, 10, (size, 1))

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        y_labels = 10

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                           which_method="shuffle")
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp

                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(MNIST, self).__init__(topo_view=dimshuffle(topo_view),
                                    y=y,
                                    axes=axes,
                                    y_labels=y_labels)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' + 'm=' +
                                 str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d" %
                                 (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start

        if which_set == 'test':
            assert fit_test_preprocessor is None or \
                (fit_preprocessor == fit_test_preprocessor)

        if self.X is not None and preprocessor:
            preprocessor.apply(self, fit_preprocessor)