def train_nice(args):
    vn = True
    center = True
    # data_path must be resolved before the branch: the original only set it
    # in the else-branch, leaving it undefined for transposed data.
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)
    if args.transposed:
        fmri = MRI.MRI_Transposed(dataset_name=args.dataset_name,
                                  even_input=True)
        input_dim = fmri.X.shape[1]
        del fmri
    else:
        mask_file = path.join(data_path, "mask.npy")
        mask = np.load(mask_file)
        input_dim = (mask == 1).sum()
        if input_dim % 2 == 1:
            input_dim -= 1
    logging.info("Input shape: %d" % input_dim)

    p = path.abspath(path.dirname(__file__))
    yaml_file = path.join(p, "nice_%s.yaml" % args.dataset_name)
    user = path.expandvars("$USER")
    save_file = "nice_%s%s%s" % (args.dataset_name,
                                 "_transposed" if args.transposed else "",
                                 "_logistic" if args.logistic else "")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, save_file))

    variance_map_file = path.join(data_path, "variance_map.npy")
    if not path.isfile(variance_map_file):
        raise ValueError("Variance map file %s not found."
                         % variance_map_file)
    train(yaml_file, save_path, input_dim, args.transposed, args.logistic,
          variance_map_file)
def test_rbm():
    save_path = path.join(serial.preprocess("${PYLEARN2_OUTS}"), "tutorials")
    if not path.isdir(serial.preprocess("${PYLEARN2_OUTS}")):
        raise IOError("PYLEARN2_OUTS environment variable not set")
    train_rbm.train_rbm(epochs=1, save_path=save_path)
    mri_analysis.main(path.join(save_path, "rbm_smri.pkl"),
                      save_path, "sz_t")
def main(args):
    dataset_name = args.dataset_name
    logger.info("Getting dataset info for %s" % dataset_name)
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + dataset_name)
    mask_file = path.join(data_path, "mask.npy")
    mask = np.load(mask_file)
    input_dim = (mask == 1).sum()

    user = path.expandvars("$USER")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, "rbm_simple_test"))

    # File parameters are path-specific ones (not model specific).
    file_params = {
        "save_path": save_path,
    }

    # `yaml_file` and `experiment` are assumed to be defined at module scope.
    yaml_template = open(yaml_file).read()
    hyperparams = expand(flatten(
        experiment.default_hyperparams(input_dim=input_dim)),
        dict_type=ydict)

    # Set additional hyperparams from command-line args.
    if args.learning_rate is not None:
        hyperparams["learning_rate"] = args.learning_rate
    if args.batch_size is not None:
        hyperparams["batch_size"] = args.batch_size

    for param in file_params:
        yaml_template = yaml_template.replace("%%(%s)s" % param,
                                              file_params[param])

    yaml = yaml_template % hyperparams

    logger.info("Training")
    train = yaml_parse.load(yaml)
    train.main_loop()
def train_nice(args):
    vn = True
    center = True
    logger.info("Getting dataset info for %s" % args.dataset_name)
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)

    if args.transposed:
        logger.info("Data in transpose...")
        mri = MRI.MRI_Transposed(dataset_name=args.dataset_name,
                                 unit_normalize=True,
                                 even_input=True,
                                 apply_mask=True)
        input_dim = mri.X.shape[1]
        variance_map_file = path.join(data_path,
                                      "transposed_variance_map.npy")
    else:
        mask_file = path.join(data_path, "mask.npy")
        mask = np.load(mask_file)
        input_dim = (mask == 1).sum()
        if input_dim % 2 == 1:
            input_dim -= 1
        mri = MRI.MRI_Standard(which_set="full",
                               dataset_name=args.dataset_name,
                               unit_normalize=True,
                               even_input=True,
                               apply_mask=True)
        variance_map_file = path.join(data_path, "variance_map.npy")
        save_variance_map(mri, variance_map_file)
    logger.info("Input shape: %d" % input_dim)

    p = path.abspath(path.dirname(__file__))
    yaml_file = path.join(p, "nice_mri.yaml")
    user = path.expandvars("$USER")

    if args.out_name is not None:
        out_name = args.out_name
    else:
        out_name = args.dataset_name
    save_file = "nice_%s%s%s" % (out_name,
                                 "_transposed" if args.transposed else "",
                                 "_logistic" if args.logistic else "")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, save_file))

    if path.isfile(save_path + ".pkl") or path.isfile(save_path + "_best.pkl"):
        answer = None
        while answer not in ["Y", "N", "y", "n"]:
            answer = raw_input("%s already exists, continuing will overwrite."
                               "\nOverwrite? (Y/N)[N]: " % save_path) or "N"
            if answer not in ["Y", "N", "y", "n"]:
                print "Please answer Y or N"
        if answer in ["N", "n"]:
            print ("If you want to run without overwrite, "
                   "consider using the -o option.")
            sys.exit()
    logger.info("Saving to prefix %s" % save_path)

    if not path.isfile(variance_map_file):
        raise ValueError("Variance map file %s not found."
                         % variance_map_file)
    train(yaml_file, save_path, input_dim, args.transposed, args.logistic,
          variance_map_file, args.dataset_name)
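# Hedged usage sketch: train_nice expects an argparse-style namespace. The
# attribute names below are taken from the function body; the repository's
# real CLI parser is assumed to live elsewhere.
#
#     from argparse import Namespace
#     args = Namespace(dataset_name="smri", transposed=False,
#                      logistic=False, out_name=None)
#     train_nice(args)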
def test_rbm():
    save_path = path.join(serial.preprocess("${PYLEARN2_OUTS}"), "tutorials")
    if not path.isdir(serial.preprocess("${PYLEARN2_OUTS}")):
        raise IOError("PYLEARN2_OUTS environment variable not set")
    train_rbm.train_rbm(epochs=1, save_path=save_path)
    show_weights.show_weights(path.join(save_path, "rbm_mnist.pkl"),
                              out=path.join(save_path,
                                            "rbm_mnist_weights.png"))
def load_aod_gts(self):
    p = path.join(self.dataset_root, "aod_extra/")
    if not path.isdir(serial.preprocess(p)):
        raise IOError("AOD extras directory %s not found."
                      % serial.preprocess(p))
    targets = np.load(serial.preprocess(p + "targets.npy"))
    novels = np.load(serial.preprocess(p + "novels.npy"))
    return targets, novels
def __init__(self, which_set, data_path=None,
             center=True, rescale=True, gcn=True):
    self.class_name = ['neg', 'pos']
    # Load data.
    path = "${PYLEARN2_DATA_PATH}/cin/"
    if data_path is None:
        data_path = path + 'feature850-2-1.pkl'
    else:
        data_path = path + data_path
    data_path = serial.preprocess(data_path)
    with open(data_path, 'rb') as f:
        train_set, valid_set, test_set = cPickle.load(f)
    self.train_set = train_set
    self.valid_set = valid_set
    self.test_set = test_set
    if which_set == 'train':
        X, Y = self.train_set
    elif which_set == 'valid':
        X, Y = self.valid_set
    else:
        X, Y = self.test_set
    # astype returns a copy; the original discarded the result.
    X = X.astype(float)

    axis = 0
    _max = np.max(X, axis=axis)
    _min = np.min(X, axis=axis)
    _mean = np.mean(X, axis=axis)
    _std = np.std(X, axis=axis)
    _scale = _max - _min

    if gcn:
        X = global_contrast_normalize(X, scale=gcn)
    else:
        if center:
            X[:, ] -= _mean
        if rescale:
            X[:, ] /= _scale

    # Two-column targets: column 0 is the label, column 1 its complement.
    # The original wrote column 0 twice, clobbering the label.
    y = np.zeros((Y.shape[0], 2))
    y[:, 0] = Y
    y[:, 1] = 1 - Y
    print X.shape, y.shape
    super(CIN_FEATURE2, self).__init__(X=X, y=y)
def __init__(self, jobs, db, name, updater, analyzer, alerter, reload=False):
    self.__dict__.update(locals())
    # `args` is assumed to be a module-level namespace providing `out_dir`.
    self.table_dir = serial.preprocess(path.join(args.out_dir, self.name))
    self.html = HTMLPage(self.name + " results")
    self.analyzer.start()
    self.updater.start()
def getFilename(i):
    base = path + 'snapshot_'
    if i < 10:
        out = base + '00%d.hdf5' % i
    elif i < 100:
        out = base + '0%d.hdf5' % i
    else:
        out = base + '%d.hdf5' % i
    return serial.preprocess(out)
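def get_filename_padded(i):
    # Equivalent one-liner to getFilename above: '%03d' zero-pads the
    # snapshot index to three digits, matching all three branches for any
    # non-negative i. A sketch; `path` is the same module-level prefix.
    return serial.preprocess(path + 'snapshot_%03d.hdf5' % i)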
def test_data():
    pylearn2_out_path = path.expandvars("$PYLEARN2_OUTS")
    assert pylearn2_out_path != "", ("PYLEARN2_OUTS environment variable is "
                                     "not set.")
    pylearn2_data_path = path.expandvars("$PYLEARN2_NI_PATH")
    assert pylearn2_data_path != "", ("PYLEARN2_NI_PATH environment"
                                      " variable is not set")
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/smri/")
    extras_path = serial.preprocess("${PYLEARN2_NI_PATH}/mri_extra/")
    try:
        assert path.isdir(data_path), data_path
        assert path.isdir(extras_path), extras_path
    except AssertionError as e:
        raise IOError("File or directory not found (%s), did you set your "
                      "PYLEARN2_NI_PATH correctly? (%s)" % (e, data_path))
def __init__(self, which_set, start=None, stop=None, shuffle=False):
    if which_set not in ['train', 'valid']:
        if which_set == 'test':
            raise ValueError("Currently test datasets not supported")
        raise ValueError(
            'Unrecognized which_set value "%s".' % (which_set,) +
            '". Valid values are ["train","valid"].')

    p = "${PYLEARN2_NI_PATH}/snp/"
    if which_set == 'train':
        data_path = p + 'gen.chr1.npy'
        label_path = p + 'gen.chr1_labels.npy'
    else:
        # The original asserted which_set == 'test', which is unreachable
        # after the check above; only 'valid' can arrive here.
        assert which_set == 'valid'
        data_path = p + 'test.npy'
        label_path = p + 'test_labels.npy'

    data_path = serial.preprocess(data_path)
    label_path = serial.preprocess(label_path)

    print "Loading data"
    topo_view = np.load(data_path)
    y = np.atleast_2d(np.load(label_path)).T
    samples, number_snps = topo_view.shape

    if start is not None:
        stop = stop if (stop <= samples) else samples
        assert 0 <= start < stop
        topo_view = topo_view[start:stop, :]
        y = y[start:stop]
        samples = topo_view.shape[0]  # keep sample count in sync with slice

    if shuffle:
        self.shuffle_rng = make_np_rng(None, default_seed=[1, 2, 3],
                                       which_method="shuffle")
        for i in xrange(samples):
            j = self.shuffle_rng.randint(samples)
            tmp = topo_view[i].copy()
            topo_view[i] = topo_view[j]
            topo_view[j] = tmp
            tmp = y[i:i + 1].copy()  # original had y[i,i+1], an indexing bug
            y[i] = y[j]
            y[j] = tmp

    super(SNP, self).__init__(X=topo_view, y=y, y_labels=np.amax(y) + 1)
def __init__(self, which_set, one_hot=False, axes=['b', 0, 1, 'c']):
    """
    .. todo::

        WRITEME
    """
    self.args = locals()

    assert which_set in self.data_split.keys()

    path = serial.preprocess(
        "${PYLEARN2_DATA_PATH}/ocr_letters/letter.data")
    with open(path, 'r') as data_f:
        data = data_f.readlines()
    data = [line.split("\t") for line in data]
    data_x = [map(int, item[6:-1]) for item in data]
    data_letters = [item[1] for item in data]
    data_fold = [int(item[5]) for item in data]

    letters = list(numpy.unique(data_letters))
    data_y = [letters.index(item) for item in data_letters]

    if which_set == 'train':
        split = slice(0, self.data_split['train'])
    elif which_set == 'valid':
        split = slice(self.data_split['train'],
                      self.data_split['train'] + self.data_split['valid'])
    elif which_set == 'test':
        split = slice(self.data_split['train'] + self.data_split['valid'],
                      (self.data_split['train'] + self.data_split['valid'] +
                       self.data_split['test']))

    data_x = numpy.asarray(data_x[split])
    data_y = numpy.asarray(data_y[split])
    data_fold = numpy.asarray(data_fold[split])  # original resliced data_y
    assert data_x.shape[0] == data_y.shape[0]
    assert data_x.shape[0] == self.data_split[which_set]

    self.one_hot = one_hot
    if one_hot:
        one_hot = numpy.zeros((data_y.shape[0], len(letters)),
                              dtype='float32')
        for i in xrange(data_y.shape[0]):
            one_hot[i, data_y[i]] = 1.
        data_y = one_hot

    view_converter = dense_design_matrix.DefaultViewConverter((16, 8, 1),
                                                              axes)
    super(OCR, self).__init__(X=data_x, y=data_y,
                              view_converter=view_converter)

    assert not contains_nan(self.X)
    self.fold = data_fold
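# Aside: the one-hot loop above (repeated in several datasets here) can be
# written as a single indexing operation. A minimal numpy-only sketch:
def one_hot_encode(labels, n_classes):
    """Return a float32 one-hot matrix with one row per integer label."""
    return numpy.eye(n_classes, dtype='float32')[labels]

# e.g. one_hot_encode(numpy.array([0, 2, 1]), 3) gives
# [[1, 0, 0], [0, 0, 1], [0, 1, 0]]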
def main(args):
    # Originally declared as main(dataset_name="smri"), but the body reads
    # everything from an argparse-style `args`, so take that directly.
    logger.info("Getting dataset info for %s" % args.dataset_name)
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)
    mask_file = path.join(data_path, "mask.npy")
    mask = np.load(mask_file)
    input_dim = (mask == 1).sum()
    if input_dim % 2 == 1:
        input_dim -= 1
    mri = MRI.MRI_Standard(which_set="full",
                           dataset_name=args.dataset_name,
                           unit_normalize=True,
                           even_input=True,
                           apply_mask=True)
    variance_map_file = path.join(data_path, "variance_map.npy")
    mri_nifti.save_variance_map(mri, variance_map_file)

    user = path.expandvars("$USER")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/%s"
                                  % (user, "jobman_test"))

    file_params = {
        "save_path": save_path,
        "variance_map_file": variance_map_file,
    }

    # `yaml_file` is assumed to be defined at module scope.
    yaml_template = open(yaml_file).read()
    hyperparams = expand(flatten(
        mlp_experiment.default_hyperparams(input_dim=input_dim)),
        dict_type=ydict)

    for param in hyperparams:
        if hasattr(args, param) and getattr(args, param):
            val = getattr(args, param)
            logger.info("Filling %s with %r" % (param, val))
            hyperparams[param] = type(hyperparams[param])(val)

    for param in file_params:
        yaml_template = yaml_template.replace("%%(%s)s" % param,
                                              file_params[param])

    yaml = yaml_template % hyperparams
    print yaml

    logger.info("Training")
    train = yaml_parse.load(yaml)
    train.main_loop()
def get_input_params(self, args, hyperparams):
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/" + args.dataset_name)

    data_class = hyperparams["data_class"]
    variance_normalize = hyperparams.get("variance_normalize", False)
    unit_normalize = hyperparams.get("unit_normalize", False)
    demean = hyperparams.get("demean", False)
    assert not (variance_normalize and unit_normalize)

    logger.info((data_class, variance_normalize, unit_normalize, demean))
    h = hash((data_class, variance_normalize, unit_normalize, demean))

    if self.d.get(h, False):
        return self.d[h]
    else:
        if data_class == "MRI_Transposed":
            assert not variance_normalize
            mri = MRI.MRI_Transposed(dataset_name=args.dataset_name,
                                     unit_normalize=unit_normalize,
                                     demean=demean,
                                     even_input=True,
                                     apply_mask=True)
            input_dim = mri.X.shape[1]
            variance_file_name = ("variance_map_transposed%s%s.npy"
                                  % ("_un" if unit_normalize else "",
                                     "_dm" if demean else ""))
        elif data_class == "MRI_Standard":
            assert not demean
            mask_file = path.join(data_path, "mask.npy")
            mask = np.load(mask_file)
            input_dim = (mask == 1).sum()
            if input_dim % 2 == 1:
                input_dim -= 1
            mri = MRI.MRI_Standard(which_set="full",
                                   dataset_name=args.dataset_name,
                                   unit_normalize=unit_normalize,
                                   variance_normalize=variance_normalize,
                                   even_input=True,
                                   apply_mask=True)
            variance_file_name = ("variance_map%s%s.npy"
                                  % ("_un" if unit_normalize else "",
                                     "_vn" if variance_normalize else ""))
        logger.info(variance_file_name)
        logger.info((data_class, variance_normalize, unit_normalize, demean))
        variance_map_file = path.join(data_path, variance_file_name)
        if not path.isfile(variance_map_file):
            logger.info("Saving variance file %s" % variance_map_file)
            mri_nifti.save_variance_map(mri, variance_map_file)
        self.d[h] = (input_dim, variance_map_file)
    return self.d[h]
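# A minimal, self-contained sketch of the memoization pattern used by
# get_input_params: hash the option tuple and cache the derived result so
# the expensive variance-map computation runs at most once per option
# combination. All names below are illustrative, not from the repository.
class OptionCache(object):
    def __init__(self):
        self.d = {}

    def get(self, data_class, unit_normalize, demean):
        h = hash((data_class, unit_normalize, demean))
        if h not in self.d:
            # Stand-in for loading the dataset and saving a variance map.
            self.d[h] = "variance_map%s%s.npy" % (
                "_un" if unit_normalize else "",
                "_dm" if demean else "")
        return self.d[h]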
def train_nice():
    vn = True
    center = True
    smri = MRI.MRI_Transposed(dataset_name="smri", even_input=True)
    input_dim = smri.X.shape[1]

    p = path.abspath(path.dirname(__file__))
    yaml_file = path.join(p, "nice_smri_transposed.yaml")
    user = path.expandvars("$USER")
    save_path = serial.preprocess("/export/mialab/users/%s/pylearn2_outs/"
                                  % user)
    assert path.isdir(save_path)
    train(yaml_file, save_path, input_dim)
def loadall(self):
    datasets = []
    for i in range(self.foldn):
        filename = self.filestr.format(str(i + 1))
        # `dirpath` appeared bare in the original; an instance attribute
        # is assumed here.
        filename = self.dirpath + filename
        filename = serial.preprocess(filename)
        print "load data file: " + filename
        # loadi is assumed to return the loaded fold; the original never
        # populated `datasets`, so datasets[0] below would have failed.
        datasets.append(self.loadi(i, filename=filename))
    dataset = datasets[0]
    # `X, y = datasetXy` in the original looks like a mangled attribute
    # access; the design-matrix attributes are assumed here.
    X, y = dataset.X, dataset.y
    # print X.shape, y.shape
    return datasets
def __init__(self, which_set='train', center=False, start=None, stop=None,
             axes=['b', 'c', 0, 1], preprocessor=None,
             fit_preprocessor=False, fit_test_preprocessor=False):
    self.shape = (8, 35, 57)
    self.size = {'train': 2849, 'valid': 2849, 'test': 2849}
    self.range = (-10, 10)
    self.path = "${PYLEARN2_DATA_PATH}/ecmwf/"
    self.set_path = {'train': 'ecmwf.train',
                     'valid': 'ecmwf.val',
                     'test': 'ecmwf.test'}

    self.args = locals()

    if which_set not in ['train', 'valid', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s".' % (which_set,) +
            '". Valid values are ["train","valid","test"].')

    path = self.path + self.set_path[which_set]
    if control.get_load_data():
        path = serial.preprocess(path)
        datasetCache = cache.datasetCache
        path = datasetCache.cache_file(path)
        X, topo_view, y = self._read_ecmwf(path, which_set)
    else:
        X = np.random.rand(self.size[which_set], np.prod(self.shape))
        # Give the random stand-in the same 4D topology as the real data;
        # the original left it flat, which breaks the unpacking below.
        topo_view = np.random.rand(self.size[which_set], *self.shape)
        y = np.random.randint(self.range[0], self.range[1],
                              (self.size[which_set], 1))

    (m, v, r, c) = topo_view.shape

    if center:
        topo_view -= topo_view.mean(axis=0)

    super(ECMWF, self).__init__(X=X, topo_view=topo_view, y=y, axes=axes)

    assert not np.any(np.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)
    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def run_experiment(experiment, **kwargs):
    """
    Experiment function.

    Used by jobman to run jobs. Must be loaded externally.
    TODO: add sigint handling.

    Parameters
    ----------
    experiment: module
        Experiment module.
    kwargs: dict
        Typically hyperparameters.
    """
    hyper_parameters = experiment.default_hyperparams()
    set_hyper_parameters(hyper_parameters, **kwargs)
    file_parameters = experiment.fileparams
    set_hyper_parameters(file_parameters, **kwargs)
    hyper_parameters.update(file_parameters)

    ih = MRIInputHandler()
    input_dim, variance_map_file = ih.get_input_params(hyper_parameters)
    hyper_parameters["nvis"] = input_dim
    hyper_parameters["variance_map_file"] = variance_map_file

    pid = os.getpid()
    out_path = serial.preprocess(
        hyper_parameters.get("out_path", "${PYLEARN2_OUTS}"))

    if not path.isdir(out_path):
        os.mkdir(out_path)
    if not path.isdir(path.join(out_path, "logs")):
        os.mkdir(path.join(out_path, "logs"))

    hyper_parameters = expand(flatten(hyper_parameters), dict_type=ydict)

    lh = LogHandler(experiment, hyper_parameters, out_path, pid)
    h = logging.StreamHandler(lh)
    monitor.log.addHandler(h)

    yaml_template = open(experiment.yaml_file).read()
    yaml = yaml_template % hyper_parameters
    train_object = yaml_parse.load(yaml)
    try:
        train_object.main_loop()
        lh.finish("COMPLETED")
    except KeyboardInterrupt:
        print("Quitting...")
        lh.finish("KILLED")
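# Hedged usage sketch: a jobman-style invocation. run_experiment requires
# the module to expose `default_hyperparams`, `fileparams`, and
# `yaml_file`; `my_experiment` below is a hypothetical module name.
#
#     import my_experiment
#     run_experiment(my_experiment,
#                    learning_rate=0.001,
#                    out_path="${PYLEARN2_OUTS}/my_run")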
def loadall(self, dirpath="${PYLEARN2_DATA_PATH}/cin/",
            filestr="feature2086-2-{}.pkl", n=10):
    datasets = []
    for i in range(n):
        filename = filestr.format(str(i + 1))
        filename = dirpath + filename
        filename = serial.preprocess(filename)
        print "load data file: " + filename
        # As in the sibling loadall above, loadi is assumed to return the
        # loaded fold; the original never populated `datasets`.
        datasets.append(self.loadi(i, filename=filename))
    dataset = datasets[0]
    X, y = dataset.X, dataset.y  # original `datasetXy` looks mangled
    print X.shape, y.shape
    return datasets
def get_mask(self):
    """
    Get mask for dataset.

    Returns
    -------
    mask: array-like
        4D array of 1 and 0 values.
    """
    p = path.join(self.dataset_root, self.dataset_name + "/")
    mask_path = serial.preprocess(p + "mask.npy")
    mask = np.load(mask_path)

    if not np.all(np.bitwise_or(mask == 0, mask == 1)):
        raise ValueError("Mask has incorrect values.")
    return mask
def test_data():
    pylearn2_data_path = path.expandvars("$PYLEARN2_DATA_PATH")
    assert pylearn2_data_path != "", ("PYLEARN2_DATA_PATH environment"
                                      " variable is not set")
    data_path = serial.preprocess("${PYLEARN2_DATA_PATH}/mnist/")
    try:
        assert path.isdir(data_path), data_path
        assert path.isfile(path.join(data_path, "t10k-images-idx3-ubyte")), \
            "t10k-images-idx3-ubyte"
        assert path.isfile(path.join(data_path, "t10k-labels-idx1-ubyte")), \
            "t10k-labels-idx1-ubyte"
        assert path.isfile(path.join(data_path, "train-images-idx3-ubyte")), \
            "train-images-idx3-ubyte"
        assert path.isfile(path.join(data_path, "train-labels-idx1-ubyte")), \
            "train-labels-idx1-ubyte"
    except AssertionError as e:
        raise IOError("File or directory not found (%s), did you set "
                      "PYLEARN2_DATA_PATH correctly? (%s)" % (e, data_path))
def _load_path(self, which_set, which_targets, word2vec_dict={}):
    # The original error message reported which_set here; it is the
    # which_targets argument being validated.
    if which_targets not in ['fine', 'coarse']:
        raise ValueError(
            'Unrecognized which_targets value "%s".' % (which_targets,) +
            '". Valid values are ["fine","coarse"].')

    if which_set not in ['train', 'test']:
        raise ValueError(
            'Unrecognized which_set value "%s".' % (which_set,) +
            '". Valid values are ["train","test"].')

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/TREC_question_type_data/"
        if which_set == 'train':
            data_path = path + 'trecqc.train_5500.label.txt'
        else:
            assert which_set == 'test'
            data_path = path + 'trecqc.test_500.label.txt'
        data_path = serial.preprocess(data_path)
        self.path = path
    return data_path
def train(yaml_file, save_path, epochs):
    yaml = open(yaml_file, "r").read()

    # Get the input dimensionality from the MRI mask.
    data_path = serial.preprocess("${PYLEARN2_NI_PATH}/smri")
    logger.info("Loading data from %s" % data_path)
    mask_file = path.join(data_path, "mask.npy")
    mask = np.load(mask_file)
    input_dim = len(np.where(mask.flatten() == 1)[0].tolist())
    del mask

    # Fill in the blanks of the yaml file.
    hyperparams = {"nvis": input_dim,
                   "batch_size": 5,
                   "detector_layer_dim": 64,
                   "monitoring_batches": 5,
                   "save_path": save_path,
                   "max_epochs": epochs}
    yaml = yaml % hyperparams
    train_yaml(yaml)
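# A minimal sketch of the %-substitution used above: the YAML template
# carries Python string-formatting placeholders that are filled from the
# hyperparameter dict before pylearn2 parses it. The template line below is
# illustrative, not copied from the repository's yaml files.
template = "!obj:pylearn2.models.dbm.DBM { nvis: %(nvis)d }"
filled = template % {"nvis": 784}
# filled == "!obj:pylearn2.models.dbm.DBM { nvis: 784 }"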
def test_multimodal_dbn():
    skip.skip_if_no_data()
    yaml_file_path = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                  '..'))
    save_path = serial.preprocess(
        "${PYLEARN2_DATA_PATH}/experiment/multimodal_dbn/")
    train_image_layer1(yaml_file_path, save_path)
    train_image_layer2(yaml_file_path, save_path)
    train_text_layer1(yaml_file_path, save_path)
    train_text_layer2(yaml_file_path, save_path)
    get_representations_for_joint_layer(yaml_file_path, save_path, 100)
    train_joint_hidden_layer(yaml_file_path, save_path)
    # train_mlp(yaml_file_path, save_path)
    try:
        # The original appended a no-op str.format call to each filename;
        # plain concatenation is what it computed.
        os.remove(save_path + "image_rbm1.pkl")
        os.remove(save_path + "image_rbm2.pkl")
    except OSError:
        pass
def __init__(self, name, which_set, image_format='png',
             image_converter='RGB'):
    if which_set not in ['train', 'test', 'valid']:
        raise ValueError(
            'Unrecognized which_set value "%s".' % (which_set,) +
            '". Valid values are ["train","test","valid"].')

    data_path = serial.preprocess('${PYLEARN2_DATA_PATH}')
    image_path = os.path.join(data_path, name, which_set)

    classes = {}
    with open(os.path.join(data_path, name, 'classes.csv'), 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            classes[row['image']] = int(row['class'])
    nb_classes = len(set(classes.values()))

    imgs = [img for img in os.listdir(image_path)
            if img.endswith(image_format)]
    img = np.array(Image.open(os.path.join(image_path, imgs[0]))
                   .convert(image_converter))
    data = np.zeros(shape=(len(imgs), img.shape[0], img.shape[1],
                           img.shape[2] if len(img.shape) == 3 else 1))
    y = np.zeros(shape=(len(imgs), nb_classes))
    for i in range(0, len(imgs)):
        img = np.array(Image.open(os.path.join(image_path, imgs[i]))
                       .convert(image_converter))
        data[i] = img.reshape(img.shape[0], img.shape[1],
                              img.shape[2] if len(img.shape) == 3 else 1)
        y[i][classes[imgs[i]]] = 1

    super(ImageDataset, self).__init__(topo_view=data, y=y)
def resolve_dataset(self, which_set, dataset_name):
    """
    Resolve the dataset from the file directories.

    Parameters
    ----------
    which_set: str
        "train", "test", or "full".
    dataset_name: str
        Dataset subdirectory name under the dataset root.
    """
    p = path.join(self.dataset_root, dataset_name + "/")

    if not path.isdir(serial.preprocess(p)):
        raise IOError("MRI dataset directory %s not found."
                      % serial.preprocess(p))

    if which_set == "train":
        data_path = p + "train.npy"
        label_path = p + "train_labels.npy"
    elif which_set == "test":
        data_path = p + "test.npy"
        label_path = p + "test_labels.npy"
    else:
        if which_set != "full":
            raise ValueError("dataset \"%s\" not supported." % which_set)
        data_path = p + "full_unshuffled.npy"
        label_path = p + "full_labels_unshuffled.npy"

    nifti_path = p + "base.nii"
    data_path = serial.preprocess(data_path)
    label_path = serial.preprocess(label_path)
    try:
        self.base_nifti = load_image(serial.preprocess(nifti_path))
        logger.info("Loaded nifti")
    except IOError:
        self.base_nifti = None
        logger.warn("`base.nii` not in dataset directory. "
                    "You may need to reprocess.")

    if not path.isfile(data_path):
        raise ValueError("Dataset %s not found in %s"
                         % (which_set, serial.preprocess(p)))
    return data_path, label_path
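# Hedged usage sketch (resolve_dataset is a method of the MRI dataset
# class; the calling context and file layout are assumed):
#
#     data_path, label_path = self.resolve_dataset("train", "smri")
#     topo_view = np.load(data_path)
#     labels = np.load(label_path)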
def make_data(self, which_set, p, center=False, variance_normalize=False,
              shuffle=False, save_dummy=False):
    """
    Function to make h5 file.

    Note: parameters the same as __init__ function.
    """
    logger.info("Making h5 file for %s" % which_set)

    if which_set == "train":
        source_path = serial.preprocess(p + "train.npy")
        data_path = serial.preprocess(p + "train.h5")
        label_path = serial.preprocess(p + "train_labels.npy")
    else:
        assert which_set == "test"
        source_path = serial.preprocess(p + "test.npy")
        data_path = serial.preprocess(p + "test.h5")
        label_path = serial.preprocess(p + "test_labels.npy")

    # The dummy rename appeared unconditionally in the original, clobbering
    # the real path; guarding on save_dummy (as in __init__) is assumed.
    if save_dummy:
        data_path = data_path.split(".")[0] + "_dummy.h5"

    # Get the topological view and labels.
    topo_view = np.load(source_path)
    y = np.load(label_path)
    num_labels = np.amax(y) + 1

    # Shape information and mask.
    samples, rows, columns, depth = topo_view.shape
    if self.mask is not None:
        assert self.mask.shape == (rows, columns, depth)
        size = len(np.where(self.mask.flatten() == 1)[0].tolist())
    else:
        size = rows * columns * depth

    self.view_converter = MRIViewConverter((rows, columns, depth))
    X = self.view_converter.topo_view_to_design_mat(topo_view, self.mask)

    # TODO(dhjelm): one_hot is going away.
    # One-hot rows must match the number of samples; the original
    # allocated (size, num_labels), i.e. one row per voxel.
    one_hot = np.zeros((samples, num_labels), dtype=config.floatX)
    for i in xrange(y.shape[0]):
        one_hot[i, y[i] - 1] = 1.

    if center:
        X -= X.mean(axis=0)
    if variance_normalize:
        X /= X.std(axis=0)

    rng = make_np_rng(None, 322, which_method="shuffle")
    if shuffle:
        index = range(samples)
        rng.shuffle(index)
        X = X[index, :]
        one_hot = one_hot[index, :]

    assert not np.any(np.isnan(X))

    h5file, node = self.init_hdf5(data_path,
                                  ([samples, size], [samples, num_labels]))
    MRI_Big.fill_hdf5(h5file, X, one_hot, node)
    h5file.close()
def __init__(self, which_set, center=False, variance_normalize=False,
             shuffle=False, apply_mask=False, preprocessor=None,
             dataset_name="smri", reprocess=False, save_dummy=False):
    """
    Parameters
    ----------
    which_set: string
        "train" or "test"
    center: bool
        If True, then data -> data - data.mean()
    variance_normalize: bool
        If True, then data -> data / data.std()
    shuffle: bool
        If True, then shuffle data when writing h5 (does nothing if not
        processing an h5).
    apply_mask: bool
        If True, then the h5 file is masked with a mask file found in the
        data directory.
    preprocessor: not supported yet, TODO.
    dataset_name: string
        Dataset sub-directory name from ${PYLEARN2_NI_PATH}
    reprocess: bool
        Some might want to reprocess the h5 file.
    save_dummy: bool
        Use a dummy file. This is for tests.
    """
    logger.warn("This class is deprecated and needs to be refactored.")

    if not path.isdir(serial.preprocess("${PYLEARN2_NI_PATH}")):
        raise ValueError("Did you set the PYLEARN2_NI_PATH variable?")

    if which_set not in ["train", "test"]:
        if which_set == "valid":
            raise ValueError(
                "Currently validation dataset is not supported with "
                "sMRI. This can be added in smri_nifti.py.")
        raise ValueError("Unrecognized which_set value %s" % (which_set,))
    self.__dict__.update(locals())
    del self.self

    p = "${PYLEARN2_NI_PATH}/%s/" % dataset_name
    # The original asserted on the raw template string; it must be
    # preprocessed before the directory check.
    assert path.isdir(serial.preprocess(p)), \
        ("No NI data directory called %s" % dataset_name)

    if which_set == "train":
        data_path = p + "train.h5"
    else:
        assert which_set == "test"
        data_path = p + "test.h5"

    # Dummy file is for tests; don't want to resave over data we might
    # actually be using every time we run a test.
    if save_dummy:
        data_path = data_path.split(".")[0] + "_dummy.h5"
    data_path = serial.preprocess(data_path)

    # Load the mask file and retrieve shape information.
    self.mask = None
    mask_path = serial.preprocess(p + "mask.npy")
    if not path.isfile(mask_path):
        raise IOError("No mask found in %s. "
                      "This file is needed to retrieve shape information. "
                      "Are you sure this is a MRI dataset?" % mask_path)
    mask = np.load(mask_path)
    rows, columns, depth = mask.shape
    if apply_mask:
        self.mask = mask

    # Make the h5 file if not present or if reprocess flag is set.
    if not os.path.isfile(data_path) or reprocess:
        self.filters = tables.Filters(complib="blosc", complevel=5)
        self.make_data(which_set, serial.preprocess(p),
                       center=center,
                       variance_normalize=variance_normalize,
                       shuffle=shuffle,
                       save_dummy=save_dummy)

    self.h5file = tables.openFile(data_path)
    data = self.h5file.getNode("/", "Data")
    view_converter = MRIViewConverter((rows, columns, depth))

    super(MRI_Big, self).__init__(X=data.X, y=data.y,
                                  view_converter=view_converter)

    self.h5file.flush()
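# Hedged usage sketch for MRI_Big: the first construction writes the h5
# file under ${PYLEARN2_NI_PATH}/<dataset_name>/ if it is missing (or if
# reprocess=True); later constructions only read it.
#
#     ds = MRI_Big(which_set="train", apply_mask=True, dataset_name="smri")
#     X = ds.X  # design matrix backed by the pytables node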
def __init__(self, which_set, center=False, shuffle=False, one_hot=False,
             binarize=False, start=None, stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None, fit_preprocessor=False,
             fit_test_preprocessor=False):
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST validation set. MNIST "
                "consists of 60,000 train examples and 10,000 test "
                "examples. If you wish to use a validation set you should "
                "divide the train set yourself. The pylearn2 dataset "
                "implements and will only ever implement the standard "
                "train / test split used in the literature.")
        raise ValueError('Unrecognized which_set value "%s".' %
                         (which_set,) +
                         '". Valid values are ["train","test"].')

    def dimshuffle(b01c):
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'
        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        self.one_hot = one_hot
        if one_hot:
            one_hot = N.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                one_hot[i, y[i]] = 1.
            y = one_hot

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both one_hot=True/False.
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                    axes=axes)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 28, 28, 1)))
        super(MNIST, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)
    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, center=False, shuffle=False, one_hot=None,
             binarize=False, start=None, stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None, fit_preprocessor=False,
             fit_test_preprocessor=False):
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST validation set. MNIST "
                "consists of 60,000 train examples and 10,000 test "
                "examples. If you wish to use a validation set you should "
                "divide the train set yourself. The pylearn2 dataset "
                "implements and will only ever implement the standard "
                "train / test split used in the literature.")
        raise ValueError('Unrecognized which_set value "%s".' %
                         (which_set,) +
                         '". Valid values are ["train","test"].')

    def dimshuffle(b01c):
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'
        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)

        # Locally cache the files before reading them.
        datasetCache = cache.datasetCache
        im_path = datasetCache.cache_file(im_path)
        label_path = datasetCache.cache_file(label_path)

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = read_mnist_labels(label_path)

        if binarize:
            topo_view = (topo_view > 0.5).astype('float32')

        max_labels = 10
        if one_hot is not None:
            warnings.warn("the `one_hot` parameter is deprecated. To get "
                          "one-hot encoded targets, request that they "
                          "live in `VectorSpace` through the `data_specs` "
                          "parameter of MNIST's iterator method. "
                          "`one_hot` will be removed on or after "
                          "September 20, 2014.", stacklevel=2)

        m, r, c = topo_view.shape
        assert r == 28
        assert c == 28
        topo_view = topo_view.reshape(m, r, c, 1)

        if which_set == 'train':
            assert m == 60000
        elif which_set == 'test':
            assert m == 10000
        else:
            assert False

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                           which_method="shuffle")
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for one_hot=True/False.
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                    axes=axes, max_labels=max_labels)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # data loading is disabled, just make something that defines the
        # right topology
        topo = dimshuffle(np.zeros((1, 28, 28, 1)))
        super(MNIST, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)
    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def main(model, out_path=None, prefix=None, **anal_args):
    """
    Main function of module.

    This function controls the high end analysis functions.

    Parameters
    ----------
    model: Pylearn2.Model or str
        Model instance or path for the model.
    out_path: str, optional
        Path for the output directory.
    prefix: str, optional
        If provided, prefix for all output files.
    anal_args: dict
        argparse arguments (defined below).
    """
    if out_path is None and prefix is None and isinstance(model, str):
        prefix = ".".join(path.basename(model).split(".")[:-1])
        sm_prefix = prefix
        nifti_prefix = prefix
    else:
        nifti_prefix = "image"

    if out_path is None:
        assert isinstance(model, str), ("If you provide a model object, you "
                                        "must provide an out_path")
        out_path = path.abspath(path.dirname(model))

    if isinstance(model, str):
        logger.info("Loading model from %s" % model)
        model = serial.load(model)

    if not path.isdir(out_path):
        os.mkdir(out_path)

    logger.info("Getting features")
    feature_dict = fe.extract_features(model, **anal_args)
    dataset = feature_dict.pop("dataset")
    if isinstance(dataset, TransformerDataset):
        dataset = dataset.raw
    ms = fe.ModelStructure(model, dataset)

    data_path = serial.preprocess(dataset.dataset_root +
                                  dataset.dataset_name)
    sim_dict_file = path.join(data_path, "sim_dict.pkl")
    sim_dict = pickle.load(open(sim_dict_file, "r"))
    analyze_ground_truth(feature_dict, sim_dict, dataset)

    anal_dict = dict()

    mask = dataset.get_mask()
    feature_dict["mask"] = fe.Features(np.array([mask]), np.array([[0]]),
                                       name="mask")

    if isinstance(dataset, MRI.MRI_Transposed):
        samples = dataset.X[:, :20].T
    else:
        samples = dataset.X[:20]
    feature_dict["samples"] = fe.Features(samples, np.array([[0] * 20]).T,
                                          name="samples")

    if isinstance(dataset, MRI.MRI_Transposed):
        mean_image = dataset.X.mean(axis=1).T
    else:
        mean_image = dataset.X.mean(axis=0)
    feature_dict["mean_image"] = fe.Features(np.array([mean_image]),
                                             np.array([[0]]).T,
                                             name="mean image")

    if dataset.variance_map is not None:
        variance_map = dataset.variance_map[1]
        feature_dict["variance_map"] = fe.Features(
            np.array([variance_map]), np.array([[0]]).T,
            name="variance map")

    for name, features in feature_dict.iteritems():
        image_dir = path.join(out_path, "%s_images" % name)
        if not path.isdir(image_dir):
            os.mkdir(image_dir)
        save_simtb_spatial_maps(dataset, features, image_dir)

        features.set_histograms(tolist=True)
        fds = dict()
        for k, f in features.f.iteritems():
            fd = dict(image=path.join("%s_images" % name, "%d.png" % f.id),
                      image_type="simtb",
                      index=f.id,
                      hists=f.hists,
                      match_indices=f.match_indices)
            fd.update(**f.stats)
            fds[k] = fd

        anal_dict[name] = dict(name=name,
                               image_dir=image_dir,
                               features=fds)

    json_file = path.join(out_path, "analysis.json")
    with open(json_file, "w") as f:
        json.dump(anal_dict, f)

    logger.info("Done.")
def train(yaml_file, save_path, epochs):
    yaml = open(yaml_file, "r").read()
    input_dim = 784  # MNIST input size

    # Fill in the blanks of the yaml file.
    hyperparams = {"nvis": input_dim,
                   "batch_size": 50,
                   "detector_layer_dim": 200,
                   "monitoring_batches": 10,
                   "train_stop": 50000,
                   "max_epochs": epochs,
                   "save_path": save_path}
    yaml = yaml % hyperparams
    train_yaml(yaml)


def train_rbm(epochs=300, save_path=None):
    # Load the yaml file.
    yaml_file = path.join(path.abspath(path.dirname(__file__)), "rbm.yaml")
    if save_path is None:
        save_path = path.abspath(path.dirname(__file__))
    train(yaml_file, save_path, epochs)


if __name__ == "__main__":
    save_path = path.join(serial.preprocess("${PYLEARN2_OUTS}"), "tutorials")
    if not path.isdir(serial.preprocess("${PYLEARN2_OUTS}")):
        raise IOError("PYLEARN2_OUTS environment variable not set")
    if not path.isdir(save_path):
        os.mkdir(save_path)
    train_rbm(save_path=save_path)
def __init__(self, which_set, imgd=65, zd=1, ds=1, center=False,
             shuffle=False, one_hot=False, binarize=False, start=None,
             stop=None, axes=['b', 0, 1, 'c'], preprocessor=None,
             fit_preprocessor=False, fit_test_preprocessor=False):
    self.args = locals()

    if which_set not in ['train', 'valid', 'test']:
        raise ValueError('Unrecognized which_set value "%s".' %
                         (which_set,) +
                         '". Valid values are ["train","valid","test"].')

    def dimshuffle(b01c):
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/lgn/"
        path = path + ("LGN1_MembraneSamples_65x65x1_mp0.50_train50000"
                       "_valid10000_test10000_seed11.pkl.gz")
        path = serial.preprocess(path)
        f = gzip.open(path, 'rb')
        train_set, valid_set, test_set = cPickle.load(f)
        f.close()

        if which_set == 'train':
            data = train_set
        elif which_set == 'valid':
            data = valid_set
        else:
            data = test_set
        input_shape = (imgd, imgd, zd)

        # Alternative hdf5 loading path, kept from the original:
        # f = h5py.File(path, 'r')
        # input_shape = f['input_shape'][...]
        # if which_set == 'train':
        #     data = f['/train_set'][...]
        # elif which_set == 'valid':
        #     data = f['/valid_set'][...]
        # else:
        #     data = f['/test_set'][...]

        # Convert images to floats in [0, 1].
        topo_view = data[0].astype(np.float32) / 255.0
        y = data[1]

        self.one_hot = one_hot
        if one_hot:
            one_hot = N.zeros((y.shape[0], 2), dtype='float32')
            for i in xrange(y.shape[0]):
                one_hot[i, y[i]] = 1.
            y = one_hot

        m = topo_view.shape[0]
        rows, cols, slices = input_shape
        topo_view = topo_view.reshape(m, rows, cols, slices)

        if center:
            topo_view -= topo_view.mean(axis=0)

        if shuffle:
            self.shuffle_rng = np.random.RandomState([1, 2, 3])
            for i in xrange(topo_view.shape[0]):
                j = self.shuffle_rng.randint(m)
                # Copy ensures that memory is not aliased.
                tmp = topo_view[i, :, :, :].copy()
                topo_view[i, :, :, :] = topo_view[j, :, :, :]
                topo_view[j, :, :, :] = tmp
                # Note: slicing with i:i+1 works for both one_hot=True/False.
                tmp = y[i:i + 1].copy()
                y[i] = y[j]
                y[j] = tmp

        super(LGN, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                  axes=axes)

        assert not N.any(N.isnan(self.X))

        if start is not None:
            assert start >= 0
            if stop > self.X.shape[0]:
                raise ValueError('stop=' + str(stop) + '>' +
                                 'm=' + str(self.X.shape[0]))
            assert stop > start
            self.X = self.X[start:stop, :]
            if self.X.shape[0] != stop - start:
                raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                                 % (self.X.shape[0], start, stop))
            if len(self.y.shape) > 1:
                self.y = self.y[start:stop, :]
            else:
                self.y = self.y[start:stop]
            assert self.y.shape[0] == stop - start
    else:
        # Data loading is disabled; just make something that defines the
        # right topology.
        topo = dimshuffle(np.zeros((1, 65, 65, 1)))
        super(LGN, self).__init__(topo_view=topo, axes=axes)
        self.X = None

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)
    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)
def __init__(self, which_set, center=False, shuffle=False, binarize=False,
             start=None, stop=None, axes=['b', 0, 1, 'c'],
             preprocessor=None, fit_preprocessor=False,
             fit_test_preprocessor=False):
    self.args = locals()

    if which_set not in ['train', 'test']:
        if which_set == 'valid':
            raise ValueError(
                "There is no such thing as the MNIST validation set. MNIST "
                "consists of 60,000 train examples and 10,000 test "
                "examples. If you wish to use a validation set you should "
                "divide the train set yourself. The pylearn2 dataset "
                "implements and will only ever implement the standard "
                "train / test split used in the literature.")
        raise ValueError('Unrecognized which_set value "%s".' %
                         (which_set,) +
                         '". Valid values are ["train","test"].')

    def dimshuffle(b01c):
        """
        .. todo::

            WRITEME
        """
        default = ('b', 0, 1, 'c')
        return b01c.transpose(*[default.index(axis) for axis in axes])

    if control.get_load_data():
        path = "${PYLEARN2_DATA_PATH}/mnist/"
        if which_set == 'train':
            im_path = path + 'train-images-idx3-ubyte'
            label_path = path + 'train-labels-idx1-ubyte'
        else:
            assert which_set == 'test'
            im_path = path + 't10k-images-idx3-ubyte'
            label_path = path + 't10k-labels-idx1-ubyte'
        # Path substitution done here in order to make the lower-level
        # mnist_ubyte.py as stand-alone as possible (for reuse in, e.g.,
        # the Deep Learning Tutorials, or in another package).
        im_path = serial.preprocess(im_path)
        label_path = serial.preprocess(label_path)

        # Locally cache the files before reading them.
        datasetCache = cache.datasetCache
        im_path = datasetCache.cache_file(im_path)
        label_path = datasetCache.cache_file(label_path)

        topo_view = read_mnist_images(im_path, dtype='float32')
        y = np.atleast_2d(read_mnist_labels(label_path)).T
    else:
        if which_set == 'train':
            size = 60000
        elif which_set == 'test':
            size = 10000
        else:
            raise ValueError('Unrecognized which_set value "%s".' %
                             (which_set,) +
                             '". Valid values are ["train","test"].')
        topo_view = np.random.rand(size, 28, 28)
        y = np.random.randint(0, 10, (size, 1))

    if binarize:
        topo_view = (topo_view > 0.5).astype('float32')

    y_labels = 10

    m, r, c = topo_view.shape
    assert r == 28
    assert c == 28
    topo_view = topo_view.reshape(m, r, c, 1)

    if which_set == 'train':
        assert m == 60000
    elif which_set == 'test':
        assert m == 10000
    else:
        assert False

    if center:
        topo_view -= topo_view.mean(axis=0)

    if shuffle:
        self.shuffle_rng = make_np_rng(None, [1, 2, 3],
                                       which_method="shuffle")
        for i in xrange(topo_view.shape[0]):
            j = self.shuffle_rng.randint(m)
            # Copy ensures that memory is not aliased.
            tmp = topo_view[i, :, :, :].copy()
            topo_view[i, :, :, :] = topo_view[j, :, :, :]
            topo_view[j, :, :, :] = tmp
            tmp = y[i:i + 1].copy()
            y[i] = y[j]
            y[j] = tmp

    super(MNIST, self).__init__(topo_view=dimshuffle(topo_view), y=y,
                                axes=axes, y_labels=y_labels)

    assert not N.any(N.isnan(self.X))

    if start is not None:
        assert start >= 0
        if stop > self.X.shape[0]:
            raise ValueError('stop=' + str(stop) + '>' +
                             'm=' + str(self.X.shape[0]))
        assert stop > start
        self.X = self.X[start:stop, :]
        if self.X.shape[0] != stop - start:
            raise ValueError("X.shape[0]: %d. start: %d stop: %d"
                             % (self.X.shape[0], start, stop))
        if len(self.y.shape) > 1:
            self.y = self.y[start:stop, :]
        else:
            self.y = self.y[start:stop]
        assert self.y.shape[0] == stop - start

    if which_set == 'test':
        assert fit_test_preprocessor is None or \
            (fit_preprocessor == fit_test_preprocessor)
    if self.X is not None and preprocessor:
        preprocessor.apply(self, fit_preprocessor)