def __init__(self, dataset_dir='../../datasets'):
    """
    Parameters
    ----------
    dataset_dir : str
        The `dataset_dir` parameter to a ``FileDataset``.
    """
    filename = 'Nottingham.zip'
    source = 'http://www-etud.iro.umontreal.ca/~boulanni/Nottingham.zip'

    super(Nottingham, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

    # now the file has been installed to the self.dataset_location directory

    # grab the appropriate filenames
    train_filenames = os.path.join(self.dataset_location, 'Nottingham', 'train', '*.mid')
    valid_filenames = os.path.join(self.dataset_location, 'Nottingham', 'valid', '*.mid')
    test_filenames = os.path.join(self.dataset_location, 'Nottingham', 'test', '*.mid')

    # glob the files
    train_files = glob.glob(train_filenames)
    valid_files = glob.glob(valid_filenames)
    test_files = glob.glob(test_filenames)

    # grab the datasets from midi-reading the files
    train_datasets = [midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
                      for f in train_files]
    valid_datasets = [midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
                      for f in valid_files]
    test_datasets = [midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX)
                     for f in test_files]

    # get the data shapes
    self.train_shapes = [train.shape for train in train_datasets]
    self.valid_shapes = [valid.shape for valid in valid_datasets]
    self.test_shapes = [test.shape for test in test_datasets]

    # put them into shared variables
    log.debug('Putting Nottingham into theano shared variables')
    self.train = dataset_shared(numpy.concatenate(train_datasets), name='nottingham_train', borrow=True)
    self.valid = dataset_shared(numpy.concatenate(valid_datasets), name='nottingham_valid', borrow=True)
    self.test = dataset_shared(numpy.concatenate(test_datasets), name='nottingham_test', borrow=True)
def __init__(self, dataset_dir='../../datasets'): """ Parameters ---------- dataset_dir : str The `dataset_dir` parameter to a ``FileDataset``. """ log.debug("Loading MuseData midi dataset...") filename = 'MuseData.zip' source = 'http://www-etud.iro.umontreal.ca/~boulanni/MuseData.zip' super(MuseData, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir) # now the file path has been installed to self.dataset_locations directory # grab the appropriate filenames train_filenames = os.path.join(self.dataset_location, 'MuseData', 'train', '*.mid') valid_filenames = os.path.join(self.dataset_location, 'MuseData', 'valid', '*.mid') test_filenames = os.path.join(self.dataset_location, 'MuseData', 'test', '*.mid') # glob the files train_files = glob.glob(train_filenames) valid_files = glob.glob(valid_filenames) test_files = glob.glob(test_filenames) # grab the datasets from midireading the files train_datasets = [ midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX) for f in train_files ] valid_datasets = [ midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX) for f in valid_files ] test_datasets = [ midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX) for f in test_files ] # get the data shapes self.train_shapes = [train.shape for train in train_datasets] self.valid_shapes = [valid.shape for valid in valid_datasets] self.test_shapes = [test.shape for test in test_datasets] # put them into shared variables log.debug('Putting MuseData into theano shared variables') self.train = dataset_shared(numpy.concatenate(train_datasets), name='muse_train', borrow=True) self.valid = dataset_shared(numpy.concatenate(valid_datasets), name='muse_valid', borrow=True) self.test = dataset_shared(numpy.concatenate(test_datasets), name='muse_test', borrow=True)
def __init__(self, binary=False, one_hot=False, concat_train_valid=True,
             dataset_dir='../../datasets', sequence_number=0, rng=None):
    """
    Parameters
    ----------
    binary : bool, optional
        Flag to binarize the input images.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together.
    dataset_dir : str, optional
        The `dataset_dir` parameter to a ``FileDataset``.
    sequence_number : int, optional
        The sequence method to use if we want to put the input images into a specific order.
        0 defaults to random.
    rng : random, optional
        The random number generator to use when sequencing.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

    filename = 'mnist.pkl.gz'
    source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

    super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

    # self.dataset_location now contains the os path to the dataset file
    # self.file_type tells how to load the dataset
    # load the dataset into memory
    # (the pickle was written by Python 2, so Python 3 needs an encoding hint
    # to decode the embedded byte strings)
    if PY3:
        pickle_load = partial(pickle.load, encoding='bytes')
    else:
        pickle_load = pickle.load

    if self.file_type is file_ops.GZ:
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
            gzip.open(self.dataset_location, 'rb')
        )
    else:
        # pickles are binary files, so open in 'rb' mode (not 'r')
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
            open(self.dataset_location, 'rb')
        )

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_X = numpy.concatenate((self.train_X, self.valid_X))
        self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

    # sequence the dataset
    if sequence_number is not None:
        self.sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        _binary_cutoff = 0.5
        log.debug('Making MNIST X values binary with cutoff %s', str(_binary_cutoff))
        self.train_X = (self.train_X > _binary_cutoff).astype('float32')
        self.valid_X = (self.valid_X > _binary_cutoff).astype('float32')
        self.test_X = (self.test_X > _binary_cutoff).astype('float32')

    # make optional one-hot labels
    if one_hot:
        self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
        self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
        self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

    log.debug("loading datasets into shared variables")
    self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
    self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)
    self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
    self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)
    self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
    self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
def __init__(self, train_X, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None):
    log.info('Wrapping matrix from memory')
    # note: super(self.__class__, self) recurses infinitely if this class is
    # ever subclassed; prefer naming the class explicitly in the super() call.
    super(self.__class__, self).__init__()

    # make sure the inputs are arrays
    train_X = numpy.array(train_X)
    self._train_shape = train_X.shape
    self.train_X = dataset_shared(train_X, name='memory_train_x', borrow=True)
    if train_Y is not None:
        try:
            self.train_Y = dataset_shared(numpy.array(train_Y), name='memory_train_y', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT train_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

    if valid_X is not None:
        try:
            valid_X = numpy.array(valid_X)
            self._valid_shape = valid_X.shape
            self.valid_X = dataset_shared(valid_X, name='memory_valid_x', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT valid_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
    if valid_Y is not None:
        try:
            self.valid_Y = dataset_shared(numpy.array(valid_Y), name='memory_valid_y', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT valid_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

    if test_X is not None:
        try:
            test_X = numpy.array(test_X)
            self._test_shape = test_X.shape
            self.test_X = dataset_shared(test_X, name='memory_test_x', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT test_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
    if test_Y is not None:
        try:
            self.test_Y = dataset_shared(numpy.array(test_Y), name='memory_test_y', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT test_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))
def __init__(self, train_X, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None):
    log.info('Wrapping matrix from memory')

    # make sure the inputs are arrays
    train_X = numpy.array(train_X)
    self._train_shape = train_X.shape
    self.train_X = dataset_shared(train_X, name='memory_train_x', borrow=True)
    if train_Y is not None:
        try:
            self.train_Y = dataset_shared(numpy.array(train_Y), name='memory_train_y', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT train_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

    if valid_X is not None:
        try:
            valid_X = numpy.array(valid_X)
            self._valid_shape = valid_X.shape
            self.valid_X = dataset_shared(valid_X, name='memory_valid_x', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT valid_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
    if valid_Y is not None:
        try:
            self.valid_Y = dataset_shared(numpy.array(valid_Y), name='memory_valid_y', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT valid_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

    if test_X is not None:
        try:
            test_X = numpy.array(test_X)
            self._test_shape = test_X.shape
            self.test_X = dataset_shared(test_X, name='memory_test_x', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT test_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
    if test_Y is not None:
        try:
            self.test_Y = dataset_shared(numpy.array(test_Y), name='memory_test_y', borrow=True)
        except Exception as e:
            log.exception("COULD NOT CONVERT test_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))
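# A minimal usage sketch for the in-memory wrapper above (not part of the
# original source). The class name is not shown in this snippet, so
# `MatrixDataset` below is a hypothetical placeholder; the sketch also assumes
# dataset_shared wraps a theano shared variable. Any array-likes are converted
# with numpy.array and pushed into shared variables, with the Y splits optional.
def _memory_usage_sketch():
    rng = numpy.random.RandomState(0)
    x = rng.rand(100, 784).astype('float32')  # 100 fake flattened images
    y = rng.randint(0, 10, size=100)          # 100 fake integer labels
    data = MatrixDataset(train_X=x, train_Y=y)  # hypothetical class name
    assert data._train_shape == (100, 784)
    assert data.train_X.get_value(borrow=True).shape == (100, 784)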
def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
             dataset_dir='../../datasets', sequence_number=0, rng=None):
    """
    Parameters
    ----------
    binary : bool, optional
        Flag to binarize the input images.
    binary_cutoff : float, optional
        If you want to binarize the input images, what threshold value to use.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together.
        This would be the original split.
    dataset_dir : str, optional
        The `dataset_dir` parameter to a ``FileDataset``.
    sequence_number : int, optional
        The sequence method to use if we want to put the input images into a specific order.
        0 defaults to random.
    rng : random, optional
        The random number generator to use when sequencing.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

    filename = 'mnist.pkl.gz'
    source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

    super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

    # self.dataset_location now contains the os path to the dataset file
    # self.file_type tells how to load the dataset
    # load the dataset into memory
    if self.file_type is file_ops.GZ:
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
            gzip.open(self.dataset_location, 'rb')
        )
    else:
        # pickles are binary files, so open in 'rb' mode (not 'r')
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
            open(self.dataset_location, 'rb')
        )

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_X = numpy.concatenate((self.train_X, self.valid_X))
        self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

    # sequence the dataset
    if sequence_number is not None:
        self.sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
        self.train_X = binarize(self.train_X, binary_cutoff)
        self.valid_X = binarize(self.valid_X, binary_cutoff)
        self.test_X = binarize(self.test_X, binary_cutoff)

    # make optional one-hot labels
    if one_hot:
        self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
        self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
        self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

    log.debug("loading datasets into shared variables")
    self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
    self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)
    self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
    self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)
    self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
    self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
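# A minimal usage sketch (not part of the original source), assuming the MNIST
# class above is importable, the download succeeds, and dataset_shared wraps a
# theano shared variable. With binary=True the 784 pixel intensities are
# thresholded at binary_cutoff; with one_hot=True each integer label becomes a
# 10-dim indicator vector.
def _mnist_usage_sketch():
    mnist = MNIST(binary=True, binary_cutoff=0.5, one_hot=True)
    train_x = mnist.train_X.get_value(borrow=True)
    train_y = mnist.train_Y.get_value(borrow=True)
    assert train_x.shape[1] == 28 * 28  # flattened 28x28 grayscale images
    assert train_y.shape[1] == 10       # one-hot over the ten digit classes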
def __init__(self, source_dir, size_key, one_hot=False, train_split=0.9, valid_split=0.1):
    print("Getting dataset %s" % size_key)
    # grab the datasets from the preprocessed files
    datasets = [(numpy.load(f), get_label(f)) for f in find_processed_files(source_dir, size_key)]
    # make sure they are all the correct dimensionality
    datasets = [(data.shape, data, label) for data, label in datasets
                if data.shape[1] == sizes[size_key] and label is not None]
    print("Found %d examples" % len(datasets))

    # shuffle!
    random.seed(1)
    random.shuffle(datasets)

    # shapes
    shapes = [shape for shape, _, _ in datasets]
    # data
    dataset = [data for _, data, _ in datasets]
    # labels
    labels = numpy.asarray([label for _, _, label in datasets], dtype='int8')
    # make the labels into one-hot vectors
    if one_hot:
        labels = numpy_one_hot(labels, n_classes=5)

    train_len = int(math.floor(train_split * len(dataset)))
    valid_len = int(math.floor(valid_split * len(dataset)))
    valid_stop = train_len + valid_len
    print("# train: %d examples" % train_len)

    train_datasets = dataset[:train_len]
    train_labels = labels[:train_len]
    if valid_len > 0:
        valid_datasets = dataset[train_len:valid_stop]
        valid_labels = labels[train_len:valid_stop]
    else:
        valid_datasets = []
    if train_len + valid_len < len(dataset):
        test_datasets = dataset[valid_stop:]
        test_labels = labels[valid_stop:]
    else:
        test_datasets = []

    # the min/max lengths feed two padding strategies: the commented-out
    # alternative truncates every sequence to the shortest, while the active
    # code zero-pads every sequence to the longest
    # median_train_len = int(numpy.median(numpy.asarray(shapes[:train_len]), axis=0)[0])
    min_train_len = int(numpy.min(numpy.asarray(shapes[:train_len]), axis=0)[0])
    max_train_len = int(numpy.max(numpy.asarray(shapes[:train_len]), axis=0)[0])
    # train = numpy.array([data[:min_train_len] for data in train_datasets], dtype='float32')
    train = numpy.array(
        [numpy.pad(data, [(0, max_train_len - data.shape[0]), (0, 0)], mode='constant')
         for data in train_datasets],
        dtype='float32'
    )
    self.train_shape = train.shape
    self.train = (dataset_shared(train, borrow=True), dataset_shared(train_labels, borrow=True))

    if len(valid_datasets) > 0:
        min_valid_len = int(numpy.min(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
        max_valid_len = int(numpy.max(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
        # valid = numpy.array([data[:min_valid_len] for data in valid_datasets], dtype='float32')
        valid = numpy.array(
            [numpy.pad(data, [(0, max_valid_len - data.shape[0]), (0, 0)], mode='constant')
             for data in valid_datasets],
            dtype='float32'
        )
        self.valid_shape = valid.shape
        self.valid = (dataset_shared(valid, borrow=True), dataset_shared(valid_labels, borrow=True))
    else:
        self.valid = None, None
        self.valid_shape = None

    if len(test_datasets) > 0:
        min_test_len = int(numpy.min(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
        max_test_len = int(numpy.max(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
        # test = numpy.array([data[:min_test_len] for data in test_datasets], dtype='float32')
        test = numpy.array(
            [numpy.pad(data, [(0, max_test_len - data.shape[0]), (0, 0)], mode='constant')
             for data in test_datasets],
            dtype='float32'
        )
        self.test_shape = test.shape
        self.test = (dataset_shared(test, borrow=True), dataset_shared(test_labels, borrow=True))
    else:
        self.test = None, None
        self.test_shape = None

    print("Train shape: %s" % str(self.train_shape))
    print("Valid shape: %s" % str(self.valid_shape))
    print("Test shape: %s" % str(self.test_shape))
    print("Dataset %s initialized!" % size_key)
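# A standalone sketch of the padding step used above (not part of the original
# source; pure numpy, runnable as-is): variable-length (timesteps, features)
# sequences are right-padded with zero rows out to the longest sequence, so the
# list stacks into one dense (n_examples, max_timesteps, features) float32 tensor.
def _pad_to_longest(sequences):
    max_len = max(seq.shape[0] for seq in sequences)
    return numpy.array(
        [numpy.pad(seq, [(0, max_len - seq.shape[0]), (0, 0)], mode='constant')
         for seq in sequences],
        dtype='float32'
    )

# e.g. two sequences of 3 and 5 timesteps with 4 features each stack to (2, 5, 4):
# _pad_to_longest([numpy.ones((3, 4)), numpy.ones((5, 4))]).shape == (2, 5, 4)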