Example #1
    def __init__(self, dataset_dir='../../datasets'):
        filename = 'Nottingham.zip'
        source = 'http://www-etud.iro.umontreal.ca/~boulanni/Nottingham.zip'

        super(Nottingham, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

        # the archive has now been downloaded and extracted under self.dataset_location
        # build the glob patterns for each split
        train_filenames = os.path.join(self.dataset_location, 'Nottingham', 'train', '*.mid')
        valid_filenames = os.path.join(self.dataset_location, 'Nottingham', 'valid', '*.mid')
        test_filenames = os.path.join(self.dataset_location, 'Nottingham', 'test', '*.mid')
        # glob the files
        train_files = glob.glob(train_filenames)
        valid_files = glob.glob(valid_filenames)
        test_files = glob.glob(test_filenames)
        # read each MIDI file into a piano-roll array
        train_datasets = [midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX) for f in train_files]
        valid_datasets = [midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX) for f in valid_files]
        test_datasets = [midiread(f, r=(21, 109), dt=0.3).piano_roll.astype(theano.config.floatX) for f in test_files]
        # get the data shapes
        self.train_shapes = [train.shape for train in train_datasets]
        self.valid_shapes = [valid.shape for valid in valid_datasets]
        self.test_shapes = [test.shape for test in test_datasets]
        # put them into shared variables
        log.debug('Putting Nottingham into theano shared variables')
        self.train = dataset_shared(numpy.concatenate(train_datasets), name='nottingham_train', borrow=True)
        self.valid = dataset_shared(numpy.concatenate(valid_datasets), name='nottingham_valid', borrow=True)
        self.test = dataset_shared(numpy.concatenate(test_datasets), name='nottingham_test', borrow=True)
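A minimal usage sketch for this loader; the import path is an assumption (it is not shown in the example):

    # hypothetical import path -- adjust to wherever Nottingham lives
    from opendeep.data.standard_datasets.midi.nottingham import Nottingham

    # downloads and extracts Nottingham.zip on first use, then exposes the
    # concatenated piano rolls as theano shared variables plus per-song shapes
    nottingham = Nottingham(dataset_dir='../../datasets')
    print(nottingham.train_shapes[:3])  # e.g. [(timesteps, 88), ...]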
Example #2
    def __init__(self, dataset_dir='../../datasets'):
        """
        Parameters
        ----------
        dataset_dir : str
            The `dataset_dir` parameter to a ``FileDataset``.
        """
        log.debug("Loading MuseData midi dataset...")

        filename = 'MuseData.zip'
        source = 'http://www-etud.iro.umontreal.ca/~boulanni/MuseData.zip'

        super(MuseData, self).__init__(filename=filename,
                                       source=source,
                                       dataset_dir=dataset_dir)

        # the archive has now been downloaded and extracted under self.dataset_location
        # build the glob patterns for each split
        train_filenames = os.path.join(self.dataset_location, 'MuseData',
                                       'train', '*.mid')
        valid_filenames = os.path.join(self.dataset_location, 'MuseData',
                                       'valid', '*.mid')
        test_filenames = os.path.join(self.dataset_location, 'MuseData',
                                      'test', '*.mid')
        # glob the files
        train_files = glob.glob(train_filenames)
        valid_files = glob.glob(valid_filenames)
        test_files = glob.glob(test_filenames)
        # read each MIDI file into a piano-roll array
        train_datasets = [
            midiread(f, r=(21, 109),
                     dt=0.3).piano_roll.astype(theano.config.floatX)
            for f in train_files
        ]
        valid_datasets = [
            midiread(f, r=(21, 109),
                     dt=0.3).piano_roll.astype(theano.config.floatX)
            for f in valid_files
        ]
        test_datasets = [
            midiread(f, r=(21, 109),
                     dt=0.3).piano_roll.astype(theano.config.floatX)
            for f in test_files
        ]
        # get the data shapes
        self.train_shapes = [train.shape for train in train_datasets]
        self.valid_shapes = [valid.shape for valid in valid_datasets]
        self.test_shapes = [test.shape for test in test_datasets]
        # put them into shared variables
        log.debug('Putting MuseData into theano shared variables')
        self.train = dataset_shared(numpy.concatenate(train_datasets),
                                    name='muse_train',
                                    borrow=True)
        self.valid = dataset_shared(numpy.concatenate(valid_datasets),
                                    name='muse_valid',
                                    borrow=True)
        self.test = dataset_shared(numpy.concatenate(test_datasets),
                                   name='muse_test',
                                   borrow=True)
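For context, midiread here appears to come from the midi utilities of the Theano RNN-RBM tutorial: each file becomes a binary piano-roll matrix with one row per dt=0.3s time slice and, for r=(21, 109), one column per pitch in the standard 88-key piano range. A small sanity-check sketch under that assumption:

    roll = midiread(f, r=(21, 109), dt=0.3).piano_roll
    n_timesteps, n_pitches = roll.shape
    assert n_pitches == 109 - 21  # 88 piano keys
    # roll[t, p] is 1.0 if MIDI pitch 21 + p sounds during time slice t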
Example #3
    def __init__(self, binary=False, one_hot=False, concat_train_valid=True,
                 dataset_dir='../../datasets', sequence_number=0, rng=None):
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

        filename = 'mnist.pkl.gz'
        source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

        super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

        # self.dataset_location now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if PY3:
            pickle_load = partial(pickle.load, encoding='bytes')
        else:
            pickle_load = pickle.load

        if self.file_type is file_ops.GZ:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
                gzip.open(self.dataset_location, 'rb')
            )
        else:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
                open(self.dataset_location, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_X = numpy.concatenate((self.train_X, self.valid_X))
            self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

        # sequence the dataset
        if sequence_number is not None:
            self.sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            _binary_cutoff = 0.5
            log.debug('Making MNIST X values binary with cutoff %s', str(_binary_cutoff))
            self.train_X = (self.train_X > _binary_cutoff).astype('float32')
            self.valid_X = (self.valid_X > _binary_cutoff).astype('float32')
            self.test_X  = (self.test_X > _binary_cutoff).astype('float32')

        # make optional one-hot labels
        if one_hot:
            self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
            self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
            self.test_Y  = numpy_one_hot(self.test_Y, n_classes=10)

        log.debug("loading datasets into shared variables")
        self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
        self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)

        self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
        self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)

        self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
        self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
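A usage sketch, again with a hypothetical import path:

    # hypothetical import path
    from opendeep.data.standard_datasets.image.mnist import MNIST

    # binarize pixels at 0.5 and one-hot the labels; with the default
    # concat_train_valid=True, train becomes the full 60,000 examples
    mnist = MNIST(binary=True, one_hot=True)
    # mnist.train_X and mnist.train_Y are now theano shared variables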
Example #4
    def __init__(self, train_X, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None):
        log.info('Wrapping matrix from memory')
        super(self.__class__, self).__init__()

        # make sure the inputs are arrays
        train_X = numpy.array(train_X)
        self._train_shape = train_X.shape
        self.train_X = dataset_shared(train_X, name='memory_train_x', borrow=True)
        if train_Y is not None:
            try:
                self.train_Y = dataset_shared(numpy.array(train_Y), name='memory_train_y', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT train_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

        if valid_X is not None:
            try:
                valid_X = numpy.array(valid_X)
                self._valid_shape = valid_X.shape
                self.valid_X = dataset_shared(valid_X, name='memory_valid_x', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT valid_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
        if valid_Y is not None:
            try:
                self.valid_Y = dataset_shared(numpy.array(valid_Y), name='memory_valid_y', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT valid_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

        if test_X is not None:
            try:
                test_X = numpy.array(test_X)
                self._test_shape = test_X.shape
                self.test_X = dataset_shared(test_X, name='memory_test_x', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT test_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
        if test_Y is not None:
            try:
                self.test_Y = dataset_shared(numpy.array(test_Y), name='memory_test_y', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT test_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))
Example #5
    def __init__(self, train_X, train_Y=None, valid_X=None, valid_Y=None, test_X=None, test_Y=None):
        log.info('Wrapping matrix from memory')
        # make sure the inputs are arrays
        train_X = numpy.array(train_X)
        self._train_shape = train_X.shape
        self.train_X = dataset_shared(train_X, name='memory_train_x', borrow=True)
        if train_Y is not None:
            try:
                self.train_Y = dataset_shared(numpy.array(train_Y), name='memory_train_y', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT train_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

        if valid_X is not None:
            try:
                valid_X = numpy.array(valid_X)
                self._valid_shape = valid_X.shape
                self.valid_X = dataset_shared(valid_X, name='memory_valid_x', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT valid_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
        if valid_Y is not None:
            try:
                self.valid_Y = dataset_shared(numpy.array(valid_Y), name='memory_valid_y', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT valid_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))

        if test_X is not None:
            try:
                test_X = numpy.array(test_X)
                self._test_shape = test_X.shape
                self.test_X = dataset_shared(test_X, name='memory_test_x', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT test_X TO NUMPY ARRAY. EXCEPTION: %s", str(e))
        if test_Y is not None:
            try:
                self.test_Y = dataset_shared(numpy.array(test_Y), name='memory_test_y', borrow=True)
            except Exception as e:
                log.exception("COULD NOT CONVERT test_Y TO NUMPY ARRAY. EXCEPTION: %s", str(e))
Example #6
    def __init__(self,
                 binary=False,
                 binary_cutoff=0.5,
                 one_hot=False,
                 concat_train_valid=False,
                 dataset_dir='../../datasets',
                 sequence_number=0,
                 rng=None):
        """
        Parameters
        ----------
        binary : bool, optional
            Flag to binarize the input images.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together, recovering the original 60,000-example training split.
        dataset_dir : str, optional
            The `dataset_dir` parameter to a ``FileDataset``.
        sequence_number : int, optional
            The sequence method to use if we want to put the input images into a specific order. 0 defaults to random.
        rng : random, optional
            The random number generator to use when sequencing.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary),
                 str(one_hot))

        filename = 'mnist.pkl.gz'
        source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

        super(MNIST, self).__init__(filename=filename,
                                    source=source,
                                    dataset_dir=dataset_dir)

        # self.dataset_location now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
                gzip.open(self.dataset_location, 'rb')
            )
        else:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
                open(self.dataset_location, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_X = numpy.concatenate((self.train_X, self.valid_X))
            self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

        # sequence the dataset
        if sequence_number is not None:
            self.sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            log.debug('Making MNIST X values binary with cutoff %s',
                      str(binary_cutoff))
            self.train_X = binarize(self.train_X, binary_cutoff)
            self.valid_X = binarize(self.valid_X, binary_cutoff)
            self.test_X = binarize(self.test_X, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
            self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
            self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

        log.debug("loading datasets into shared variables")
        self.train_X = dataset_shared(self.train_X,
                                      name='mnist_train_x',
                                      borrow=True)
        self.train_Y = dataset_shared(self.train_Y,
                                      name='mnist_train_y',
                                      borrow=True)

        self.valid_X = dataset_shared(self.valid_X,
                                      name='mnist_valid_x',
                                      borrow=True)
        self.valid_Y = dataset_shared(self.valid_Y,
                                      name='mnist_valid_y',
                                      borrow=True)

        self.test_X = dataset_shared(self.test_X,
                                     name='mnist_test_x',
                                     borrow=True)
        self.test_Y = dataset_shared(self.test_Y,
                                     name='mnist_test_y',
                                     borrow=True)
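Example #6 delegates thresholding to a binarize helper rather than doing it inline. Its implementation is not shown; a sketch consistent with the inline version in Example #3 would be:

    def binarize(x, cutoff=0.5):
        # same thresholding Example #3 performs inline
        return (x > cutoff).astype('float32')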
Example #7
    def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
                 dataset_dir='../../datasets', sequence_number=0, rng=None):
        """
        Parameters
        ----------
        binary : bool, optional
            Flag to binarize the input images.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together, recovering the original 60,000-example training split.
        dataset_dir : str, optional
            The `dataset_dir` parameter to a ``FileDataset``.
        sequence_number : int, optional
            The sequence method to use if we want to put the input images into a specific order. 0 defaults to random.
        rng : random, optional
            The random number generator to use when sequencing.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

        filename = 'mnist.pkl.gz'
        source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

        super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

        # self.dataset_location now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
                gzip.open(self.dataset_location, 'rb')
            )
        else:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
                open(self.dataset_location, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_X = numpy.concatenate((self.train_X, self.valid_X))
            self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

        # sequence the dataset
        if sequence_number is not None:
            self.sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
            self.train_X = binarize(self.train_X, binary_cutoff)
            self.valid_X = binarize(self.valid_X, binary_cutoff)
            self.test_X  = binarize(self.test_X, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
            self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
            self.test_Y  = numpy_one_hot(self.test_Y, n_classes=10)

        log.debug("loading datasets into shared variables")
        self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
        self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)

        self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
        self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)

        self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
        self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
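Several of these examples call numpy_one_hot to expand integer labels. The helper is not shown; a minimal sketch of the plausible behavior:

    import numpy

    def numpy_one_hot(labels, n_classes):
        # one row per label, with a single 1 in the column of the class index
        out = numpy.zeros((len(labels), n_classes), dtype='float32')
        out[numpy.arange(len(labels)), labels] = 1.0
        return out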
Example #8
    def __init__(self, source_dir, size_key, one_hot=False, train_split=0.9, valid_split=0.1):
        print("Getting dataset %s" % size_key)
        # grab the datasets from the preprocessed files
        datasets = [(numpy.load(f), get_label(f)) for f in find_processed_files(source_dir, size_key)]
        # make sure they are all the correct dimensionality
        datasets = [(data.shape, data, label) for data, label in datasets
                    if data.shape[1] == sizes[size_key] and label is not None]

        print("Found %d examples" % len(datasets))

        # shuffle!
        random.seed(1)
        random.shuffle(datasets)

        # shapes
        shapes = [shape for shape, _, _ in datasets]
        # data
        dataset = [data for _, data, _ in datasets]
        # labels
        labels = numpy.asarray([label for _, _, label in datasets], dtype='int8')
        # make the labels into one-hot vectors
        if one_hot:
            labels = numpy_one_hot(labels, n_classes=5)

        train_len = int(math.floor(train_split * len(dataset)))
        valid_len = int(math.floor(valid_split * len(dataset)))
        valid_stop = train_len + valid_len
        print("# train: %d examples" % train_len)

        train_datasets = dataset[:train_len]
        train_labels = labels[:train_len]

        if valid_len > 0:
            valid_datasets = dataset[train_len:valid_stop]
            valid_labels = labels[train_len:valid_stop]
        else:
            valid_datasets = []

        if train_len + valid_len < len(dataset):
            test_datasets = dataset[valid_stop:]
            test_labels = labels[valid_stop:]
        else:
            test_datasets = []

        # median_train_len = int(numpy.median(numpy.asarray(shapes[:train_len]), axis=0)[0])
        min_train_len = int(numpy.min(numpy.asarray(shapes[:train_len]), axis=0)[0])
        max_train_len = int(numpy.max(numpy.asarray(shapes[:train_len]), axis=0)[0])
        # train = numpy.array([data[:min_train_len] for data in train_datasets], dtype='float32')
        train = numpy.array(
            [numpy.pad(data, [(0, max_train_len-data.shape[0]), (0, 0)], mode='constant') for data in train_datasets],
            dtype='float32'
        )
        self.train_shape = train.shape
        self.train = (dataset_shared(train, borrow=True), dataset_shared(train_labels, borrow=True))

        if len(valid_datasets) > 0:
            min_valid_len = int(numpy.min(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
            max_valid_len = int(numpy.max(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
            # valid = numpy.array([data[:min_valid_len] for data in valid_datasets], dtype='float32')
            valid = numpy.array(
                [numpy.pad(data, [(0, max_valid_len-data.shape[0]), (0, 0)], mode='constant') for data in
                 valid_datasets],
                dtype='float32'
            )
            self.valid_shape = valid.shape
            self.valid = (dataset_shared(valid, borrow=True), dataset_shared(valid_labels, borrow=True))
        else:
            self.valid = None, None
            self.valid_shape = None

        if len(test_datasets) > 0:
            min_test_len = int(numpy.min(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
            max_test_len = int(numpy.max(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
            # test = numpy.array([data[:min_test_len] for data in test_datasets], dtype='float32')
            test = numpy.array(
                [numpy.pad(data, [(0, max_test_len - data.shape[0]), (0, 0)], mode='constant') for data in
                 test_datasets],
                dtype='float32'
            )
            self.test_shape = test.shape
            self.test = (dataset_shared(test, borrow=True), dataset_shared(test_labels, borrow=True))
        else:
            self.test = None, None
            self.test_shape = None


        print("Train shape: %s" % str(self.train_shape))
        print("Valid shape: %s" % str(self.valid_shape))
        print("Test shape: %s" % str(self.test_shape))
        print("Dataset %s initialized!" % size_key)