Example #1
    def __init__(self, binary=False, one_hot=False, concat_train_valid=True,
                 dataset_dir='../../datasets', sequence_number=0, rng=None):
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

        filename = 'mnist.pkl.gz'
        source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

        super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

        # self.dataset_location now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if PY3:
            pickle_load = partial(pickle.load, encoding='bytes')
        else:
            pickle_load = partial(pickle.load)

        if self.file_type is file_ops.GZ:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
                gzip.open(self.dataset_location, 'rb')
            )
        else:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
                open(self.dataset_location, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_X = numpy.concatenate((self.train_X, self.valid_X))
            self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

        # sequence the dataset
        if sequence_number is not None:
            self.sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            _binary_cutoff = 0.5
            log.debug('Making MNIST X values binary with cutoff %s', str(_binary_cutoff))
            self.train_X = (self.train_X > _binary_cutoff).astype('float32')
            self.valid_X = (self.valid_X > _binary_cutoff).astype('float32')
            self.test_X  = (self.test_X > _binary_cutoff).astype('float32')

        # make optional one-hot labels
        if one_hot:
            self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
            self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
            self.test_Y  = numpy_one_hot(self.test_Y, n_classes=10)

        log.debug("loading datasets into shared variables")
        self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
        self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)

        self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
        self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)

        self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
        self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
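
Every example on this page leans on numpy_one_hot. For reference, here is a minimal sketch of what such a helper could look like, assuming it simply maps integer class indices onto rows of an identity matrix (the real OpenDeep implementation may differ in dtype and edge-case handling):

import numpy

def numpy_one_hot(labels, n_classes=None):
    # Hypothetical sketch, not the library source: convert a 1-D sequence of
    # integer class ids into a 2-D one-hot float32 array.
    labels = numpy.asarray(labels, dtype='int64')
    if n_classes is None:
        n_classes = int(labels.max()) + 1
    return numpy.eye(n_classes, dtype='float32')[labels]

For example, numpy_one_hot([0, 2], n_classes=3) would give [[1, 0, 0], [0, 0, 1]].
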
Example #2
 def setUp(self):
     # numpy array to test
     self.np = numpy.eye(10)
     # generator over word vectors to test
     words = "Hello\nWorld\nThis\nIs\nA\nTest!".split("\n")
     vocab = {char: n for char, n in zip(list(set(words)), range(len(set(words))))}
     words = [vocab[x] for x in words]
     self.words = numpy_one_hot(words, n_classes=len(vocab))
Example #3
 def setUp(self):
     # numpy array to test
     self.np = numpy.eye(10)
     # generator over word vectors to test
     words = "Hello\nWorld\nThis\nIs\nA\nTest!".split("\n")
     vocab = {
         char: n
         for char, n in zip(list(set(words)), range(len(set(words))))
     }
     words = [vocab[x] for x in words]
     self.words = numpy_one_hot(words, n_classes=len(vocab))
Example #4
def __process_str(data_str, vocab):
    # process the raw input data string
    data = []
    for data_char in data_str:
        data.append(vocab.get(data_char, 0))

    data = numpy_one_hot(numpy.asarray(data), n_classes=numpy.amax(list(vocab.values())) + 1)

    seq, dim = data.shape
    data = numpy.reshape(data, (1, seq, dim))

    return data
Example #5
def string_to_data(query):
    vocab = pickle.load(open('vocab.pkl', 'rb'))
    # process the raw input data string
    data = []
    # get the integer encodings
    for data_char in query:
        data.append(vocab.get(data_char, 0))

    # convert the integers to one-hot arrays
    data = numpy_one_hot(numpy.asarray(data), n_classes=numpy.amax(list(vocab.values())) + 1)

    # make 3D for model input
    seq, dim = data.shape
    data = numpy.reshape(data, (1, seq, dim))

    return data
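
To make the shapes above concrete, here is a hedged usage sketch of __process_str with a made-up toy vocabulary (0 is assumed to be the unknown-character id, matching the vocab.get(data_char, 0) call):

# Toy vocabulary, purely illustrative; the real vocab.pkl mapping is not shown here.
toy_vocab = {'a': 1, 'b': 2, 'c': 3}
encoded = __process_str("abca", toy_vocab)
print(encoded.shape)  # (1, 4, 4): one batch, 4 characters, one-hot dimension amax + 1 = 4
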
Example #6
    def __init__(self, path, source=None, train_filter=None, valid_filter=None, test_filter=None,
                 inputs_preprocess=None, targets_preprocess=None,
                 vocab=None, label_vocab=None, unk_token="<UNK>", level="char", target_n_future=None,
                 sequence_length=False):
        """
        Initialize a text-based dataset.

        Parameters
        ----------
        path : str
            The name of the file or directory for the dataset.
        source : str, optional
            The URL path for downloading the dataset (if applicable).
        train_filter : regex string or compiled regex object, optional
            The regular expression filter to match training file names against (if applicable).
        valid_filter : regex string or compiled regex object, optional
            The regular expression filter to match validation file names against (if applicable).
        test_filter : regex string or compiled regex object, optional
            The regular expression filter to match testing file names against (if applicable).
        inputs_preprocess : function, optional
            A preprocessing function to apply to input data. This function will be applied to each line
            from the files in `path`, and if it creates a list of elements, each element will be yielded as the
            input data separately. For example, the function could be ``lambda line: (line.split(',')[0]).lower()``
            to grab a string before a comma on each line and lowercase it. Preprocessing will happen before any
            tokenization is applied i.e. tokenizing and processing are composed as tokenize(preprocess(line)).
        targets_preprocess : function, optional
            A preprocessing function to apply to targets data. This function will be applied to each line from
            the files in `path`, and if it creates a list of elements, each element will be yielded as the target
            label data separately. For example, the function could be ``lambda line: (line.split(',')[1]).lower()``
            to grab a label after a comma on each line and lowercase it. Tokenization will not be applied to data
            yielded from the targets' preprocessing.
        vocab : dict, optional
            A starting dictionary to use when converting tokens to numbers.
        label_vocab : dict, optional
            A starting dictionary to use when converting labels (targets) to numbers.
        unk_token : str
            The representation for an unknown token to use in the vocab dictionary.
        level : str
            Either ``char``, ``word``, or ``line``, saying how to process the text.
            For ``char``, data will be character-level.
            For ``word``, data will be split by whitespace.
            For ``line``, data will be split by newline.
        target_n_future : int, optional
            For creating language models that predict tokens in the future, this determines the skip size (number of
            steps in the future) that the language model will try to predict as its target. Most language models will
            have target_n_future=1. If `target_n_future` is not None, the targets will be created from the inputs
            (but still apply targets_preprocess instead of inputs_preprocess if it is different).
        sequence_length : int, optional
            The maximum length of subsequences to iterate over this dataset. If this is None or False, the data
            will just be supplied as a stream of one-hot vectors rather than broken into 2-D one-hot vector sequences.
        """
        # Figure out if we want characters, words, or lines processed, and create the processing function
        # to compose on top of the preprocessing function arguments.
        level = level.lower()
        if level == "char":
            tokenize = lambda s: list(s)
        elif level == "word":
            if NLTK_AVAILABLE:
                tokenize = lambda s: nltk.tokenize.word_tokenize(s)
            else:
                warnings.warn("NLTK isn't installed - going to split strings by whitespace. Highly recommended "
                              "that you install nltk for better word tokenization.")
                tokenize = lambda s: s.split()
        elif level == "line":
            tokenize = lambda s: [s]
        else:
            tokenize = None

        if sequence_length:
            assert sequence_length > 1, "Need to have a sequence_length greater than 1, found %d" % sequence_length
        self.sequence_len = sequence_length

        # modify our file stream's processors to work with the appropriate level!
        # if target_n_future is not none, we are assuming that this is a language model and that we should tokenize the target
        if target_n_future is not None:
            targets_preprocess = compose(tokenize, inputs_preprocess)
        inputs_preprocess = compose(tokenize, inputs_preprocess)

        # call super to create the data streams
        super(TextDataset, self).__init__(path=path, source=source,
                                          train_filter=train_filter, valid_filter=valid_filter, test_filter=test_filter,
                                          inputs_preprocess=inputs_preprocess, targets_preprocess=targets_preprocess)
        # after this call, train_inputs, train_targets, etc. are all lists or None.

        # determine if this is a language model, and adjust the stream accordingly to use the inputs as the targets
        if target_n_future is not None:
            self.train_targets = FileStream(path, train_filter, targets_preprocess, target_n_future)
            if valid_filter is not None:
                self.valid_targets = FileStream(path, valid_filter, targets_preprocess, target_n_future)
            if test_filter is not None:
                self.test_targets = FileStream(path, test_filter, targets_preprocess, target_n_future)

        # Create our vocab dictionary if it doesn't exist!
        self.unk_token = unk_token
        vocab_inputs = [self.train_inputs] + (self.valid_inputs or [])
        self.vocab = vocab or self.compile_vocab(itertools.chain(*vocab_inputs))
        vocab_len = len(self.vocab)
        self.vocab_inverse = {v: k for k, v in self.vocab.items()}

        # Now modify our various inputs streams with one-hot versions using the vocab dictionary.
        # (making sure they remain as lists to satisfy the superclass condition)
        rep = lambda token: self.vocab.get(token, self.vocab.get(self.unk_token))
        one_hot = lambda token: numpy_one_hot([rep(token)], n_classes=vocab_len)[0]
        self.train_inputs = ModifyStream(self.train_inputs, one_hot)
        if self.sequence_len:
            self.train_inputs = self._subsequence(self.train_inputs)
        if self.valid_inputs is not None:
            self.valid_inputs = ModifyStream(self.valid_inputs, one_hot)
            if self.sequence_len:
                self.valid_inputs = self._subsequence(self.valid_inputs)
        if self.test_inputs is not None:
            self.test_inputs = ModifyStream(self.test_inputs, one_hot)
            if self.sequence_len:
                self.test_inputs = self._subsequence(self.test_inputs)

        # Now deal with possible output streams (either tokenizing it using the supplied label dictionary,
        # creating the label dictionary, or using the vocab dictionary if it is a language model (target_n_future is not none)
        if self.train_targets is not None and target_n_future is None:
            vocab_inputs = [self.train_targets] + (self.valid_targets or [])
            self.label_vocab = label_vocab or \
                               self.compile_vocab(itertools.chain(*vocab_inputs))
            self.label_vocab_inverse = {v: k for k, v in self.label_vocab.items()}
        # if this is a language model, label vocab is same as input vocab
        elif target_n_future is not None:
            self.label_vocab = self.vocab
            self.label_vocab_inverse = self.vocab_inverse
        else:
            self.label_vocab = None
            self.label_vocab_inverse = None

        # now modify the output streams with the one-hot representation using the vocab (making sure they remain
        # as lists to satisfy the superclass condition)
        if self.label_vocab is not None:
            label_vocab_len = len(self.label_vocab)
            label_rep = lambda token: self.label_vocab.get(token, self.label_vocab.get(self.unk_token))
            label_one_hot = lambda token: numpy_one_hot([label_rep(token)], n_classes=label_vocab_len)[0]
            if self.train_targets is not None:
                self.train_targets = ModifyStream(self.train_targets, label_one_hot)
                if self.sequence_len:
                    self.train_targets = self._subsequence(self.train_targets)
            if self.valid_targets is not None:
                self.valid_targets = ModifyStream(self.valid_targets, label_one_hot)
                if self.sequence_len:
                    self.valid_targets = self._subsequence(self.valid_targets)
            if self.test_targets is not None:
                self.test_targets = ModifyStream(self.test_targets, label_one_hot)
                if self.sequence_len:
                    self.test_targets = self._subsequence(self.test_targets)
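
As a usage note, here is a hedged sketch of constructing this TextDataset for a character-level language model; the directory and filter patterns below are made up, only the parameter names come from the signature above:

# Hypothetical instantiation; path and filters are illustrative only.
dataset = TextDataset(
    path='data/my_corpus/',
    train_filter='.*train.*',
    valid_filter='.*valid.*',
    test_filter='.*test.*',
    level='char',          # emit one one-hot vector per character
    target_n_future=1,     # language-model targets: the next character
    sequence_length=50,    # break the stream into subsequences of 50 steps
)
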
Example #7
    def __init__(self,
                 binary=False,
                 binary_cutoff=0.5,
                 one_hot=False,
                 concat_train_valid=False,
                 dataset_dir='../../datasets',
                 sequence_number=0,
                 rng=None):
        """
        Parameters
        ----------
        binary : bool, optional
            Flag to binarize the input images.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together. This would be the original split.
        dataset_dir : str, optional
            The `dataset_dir` parameter to a ``FileDataset``.
        sequence_number : int, optional
            The sequence method to use if we want to put the input images into a specific order. 0 defaults to random.
        rng : random, optional
            The random number generator to use when sequencing.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary),
                 str(one_hot))

        filename = 'mnist.pkl.gz'
        source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

        super(MNIST, self).__init__(filename=filename,
                                    source=source,
                                    dataset_dir=dataset_dir)

        # self.dataset_location now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_X,
             self.train_Y), (self.valid_X,
                             self.valid_Y), (self.test_X,
                                             self.test_Y) = pickle.load(
                                                 gzip.open(
                                                     self.dataset_location,
                                                     'rb'))
        else:
            (self.train_X,
             self.train_Y), (self.valid_X,
                             self.valid_Y), (self.test_X,
                                             self.test_Y) = pickle.load(
                                                 open(self.dataset_location,
                                                       'rb'))

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_X = numpy.concatenate((self.train_X, self.valid_X))
            self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

        # sequence the dataset
        if sequence_number is not None:
            self.sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            log.debug('Making MNIST X values binary with cutoff %s',
                      str(binary_cutoff))
            self.train_X = binarize(self.train_X, binary_cutoff)
            self.valid_X = binarize(self.valid_X, binary_cutoff)
            self.test_X = binarize(self.test_X, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
            self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
            self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

        log.debug("loading datasets into shared variables")
        self.train_X = dataset_shared(self.train_X,
                                      name='mnist_train_x',
                                      borrow=True)
        self.train_Y = dataset_shared(self.train_Y,
                                      name='mnist_train_y',
                                      borrow=True)

        self.valid_X = dataset_shared(self.valid_X,
                                      name='mnist_valid_x',
                                      borrow=True)
        self.valid_Y = dataset_shared(self.valid_Y,
                                      name='mnist_valid_y',
                                      borrow=True)

        self.test_X = dataset_shared(self.test_X,
                                     name='mnist_test_x',
                                     borrow=True)
        self.test_Y = dataset_shared(self.test_Y,
                                     name='mnist_test_y',
                                     borrow=True)
Example #8
    def __init__(self,
                 binary=False,
                 binary_cutoff=0.5,
                 one_hot=False,
                 concat_train_valid=False,
                 sequence_number=0,
                 seq_3d=False,
                 seq_length=30,
                 rng=None,
                 path=mnist_path,
                 source=mnist_source):
        """
        Parameters
        ----------
        binary : bool, optional
            Flag to binarize the input images.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together. This would be the original split.
        sequence_number : int, optional
            The sequence method to use if we want to put the input images into a specific order. 0 defaults to random.
        seq_3d : bool, optional
            When sequencing, whether the output should be
            3D tensors (batches, subsequences, data) or 2D (sequence, data).
        seq_length : int, optional
            The length of the subsequences to chop the data into when `seq_3d` is True.
        rng : random, optional
            The random number generator to use when sequencing.
        path : str, optional
            The `path` parameter to a ``FileDataset``.
        source : str, optional
            The `source` parameter to a ``FileDataset``.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary),
                 str(one_hot))

        super(MNIST, self).__init__(path=path, source=source)

        # self.path now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                gzip.open(self.path, 'rb')
            )
        else:
            (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                open(self.path, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_inputs = numpy.concatenate(
                (self.train_inputs, self.valid_inputs))
            self.train_targets = numpy.concatenate(
                (self.train_targets, self.valid_targets))

        # sequence the dataset
        if sequence_number is not None:
            self._sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            log.debug('Making MNIST X values binary with cutoff %s',
                      str(binary_cutoff))
            self.train_inputs = binarize(self.train_inputs, binary_cutoff)
            self.valid_inputs = binarize(self.valid_inputs, binary_cutoff)
            self.test_inputs = binarize(self.test_inputs, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_targets = numpy_one_hot(self.train_targets,
                                               n_classes=10)
            self.valid_targets = numpy_one_hot(self.valid_targets,
                                               n_classes=10)
            self.test_targets = numpy_one_hot(self.test_targets, n_classes=10)

        # optionally make 3D instead of 2D
        if seq_3d:
            log.debug("Making 3D....")
            # chop up into sequences of length seq_length
            # first make sure to chop off the remainder of the data so seq_length can divide evenly.
            length, dim = self.train_inputs.shape
            if self.train_targets.ndim == 1:
                ydim = 1
            else:
                ydim = self.train_targets.shape[-1]
            n_seq = length // seq_length
            self.train_inputs = self.train_inputs[:seq_length * n_seq]
            self.train_targets = self.train_targets[:seq_length * n_seq]
            # now create the 3D tensor of sequences - they will be (num_sequences, seq_length, 784)
            self.train_inputs = numpy.reshape(self.train_inputs,
                                              (n_seq, seq_length, dim))
            self.train_targets = numpy.reshape(self.train_targets,
                                               (n_seq, seq_length, ydim))

            length, dim = self.valid_inputs.shape
            if self.valid_targets.ndim == 1:
                ydim = 1
            else:
                ydim = self.valid_targets.shape[-1]
            n_seq = length // seq_length
            self.valid_inputs = self.valid_inputs[:seq_length * n_seq]
            self.valid_targets = self.valid_targets[:seq_length * n_seq]
            # now create the 3D tensor of sequences - they will be (num_sequences, seq_length, 784)
            self.valid_inputs = numpy.reshape(self.valid_inputs,
                                              (n_seq, seq_length, dim))
            self.valid_targets = numpy.reshape(self.valid_targets,
                                               (n_seq, seq_length, ydim))

            length, dim = self.test_inputs.shape
            if self.test_targets.ndim == 1:
                ydim = 1
            else:
                ydim = self.test_targets.shape[-1]
            n_seq = length // seq_length
            self.test_inputs = self.test_inputs[:seq_length * n_seq]
            self.test_targets = self.test_targets[:seq_length * n_seq]
            # now create the 3D tensor of sequences - they will be (num_sequences, seq_length, 784)
            self.test_inputs = numpy.reshape(self.test_inputs,
                                             (n_seq, seq_length, dim))
            self.test_targets = numpy.reshape(self.test_targets,
                                              (n_seq, seq_length, ydim))

            self._train_shape = self.train_inputs.shape
            self._valid_shape = self.valid_inputs.shape
            self._test_shape = self.test_inputs.shape
            log.debug('Train shape is: %s', str(self._train_shape))
            log.debug('Valid shape is: %s', str(self._valid_shape))
            log.debug('Test shape is: %s', str(self._test_shape))
Example #9
    def __init__(self,
                 path,
                 source=None,
                 train_filter=None,
                 valid_filter=None,
                 test_filter=None,
                 inputs_preprocess=None,
                 targets_preprocess=None,
                 vocab=None,
                 label_vocab=None,
                 unk_token="<UNK>",
                 level="char",
                 target_n_future=None,
                 sequence_length=False):
        """
        Initialize a text-based dataset. It will output one-hot vector encodings for the appropriate level (word,
        char, line).

        Parameters
        ----------
        path : str
            The name of the file or directory for the dataset.
        source : str, optional
            The URL path for downloading the dataset (if applicable).
        train_filter : regex string or compiled regex object, optional
            The regular expression filter to match training file names against (if applicable).
        valid_filter : regex string or compiled regex object, optional
            The regular expression filter to match validation file names against (if applicable).
        test_filter : regex string or compiled regex object, optional
            The regular expression filter to match testing file names against (if applicable).
        inputs_preprocess : function, optional
            A preprocessing function to apply to input data. This function will be applied to each line
            from the files in `path`, and if it creates a list of elements, each element will be yielded as the
            input data separately. For example, the function could be ``lambda line: (line.split(',')[0]).lower()``
            to grab a string before a comma on each line and lowercase it. Preprocessing will happen before any
            tokenization is applied i.e. tokenizing and processing are composed as tokenize(preprocess(line)).
        targets_preprocess : function, optional
            A preprocessing function to apply to targets data. This function will be applied to each line from
            the files in `path`, and if it creates a list of elements, each element will be yielded as the target
            label data separately. For example, the function could be ``lambda line: (line.split(',')[1]).lower()``
            to grab a label after a comma on each line and lowercase it. Tokenization will not be applied to data
            yielded from the targets' preprocessing.
        vocab : dict, optional
            A starting dictionary to use when converting tokens to numbers.
        label_vocab : dict, optional
            A starting dictionary to use when converting labels (targets) to numbers.
        unk_token : str
            The representation for an unknown token to use in the vocab dictionary.
        level : str
            Either ``char``, ``word``, or ``line``, saying how to process the text.
            For ``char``, data will be character-level.
            For ``word``, data will be split by whitespace.
            For ``line``, data will be split by newline.
        target_n_future : int, optional
            For creating language models that predict tokens in the future, this determines the skip size (number of
            steps in the future) that the language model will try to predict as its target. Most language models will
            have target_n_future=1. If `target_n_future` is not None, the targets will be created from the inputs
            (but still apply targets_preprocess instead of inputs_preprocess if it is different).
        sequence_length : int, optional
            The maximum length of subsequences to iterate over this dataset. If this is None or False, the data
            will just be supplied as a stream of one-hot vectors rather than broken into 2-D one-hot vector sequences.
        """
        # Figure out if we want characters, words, or lines processed, and create the processing function
        # to compose on top of the preprocessing function arguments.
        level = level.lower()
        if level == "char":
            tokenize = lambda s: list(s)
        elif level == "word":
            if NLTK_AVAILABLE:
                tokenize = lambda s: nltk.tokenize.word_tokenize(s)
            else:
                warnings.warn(
                    "NLTK isn't installed - going to split strings by whitespace. Highly recommended "
                    "that you install nltk for better word tokenization.")
                tokenize = lambda s: s.split()
        elif level == "line":
            tokenize = lambda s: [s]
        else:
            tokenize = None

        if sequence_length:
            assert sequence_length > 1, "Need to have a sequence_length greater than 1, found %d" % sequence_length
        self.sequence_len = sequence_length

        # modify our file stream's processors to work with the appropriate level!
        # if target_n_future is not none, we are assuming that this is a language model and that we
        # should tokenize the target
        if target_n_future is not None:
            targets_preprocess = compose(tokenize, inputs_preprocess)
        inputs_preprocess = compose(tokenize, inputs_preprocess)

        # call super to create the data streams
        super(TextDataset,
              self).__init__(path=path,
                             source=source,
                             train_filter=train_filter,
                             valid_filter=valid_filter,
                             test_filter=test_filter,
                             inputs_preprocess=inputs_preprocess,
                             targets_preprocess=targets_preprocess)
        # after this call, train_inputs, train_targets, etc. are all lists or None.

        # determine if this is a language model, and adjust the stream accordingly to use the inputs as the targets
        if target_n_future is not None:
            self.train_targets = FileStream(path, train_filter,
                                            targets_preprocess,
                                            target_n_future)
            if valid_filter is not None:
                self.valid_targets = FileStream(path, valid_filter,
                                                targets_preprocess,
                                                target_n_future)
            if test_filter is not None:
                self.test_targets = FileStream(path, test_filter,
                                               targets_preprocess,
                                               target_n_future)

        # Create our vocab dictionary if it doesn't exist!
        self.unk_token = unk_token
        vocab_inputs = [self.train_inputs] + (self.valid_inputs or [])
        self.vocab = vocab or self.compile_vocab(
            itertools.chain(*vocab_inputs))
        vocab_len = len(self.vocab)
        self.vocab_inverse = {v: k for k, v in self.vocab.items()}

        # Now modify our various inputs streams with one-hot versions using the vocab dictionary.
        # (making sure they remain as lists to satisfy the superclass condition)
        rep = lambda token: self.vocab.get(token, self.vocab.get(self.unk_token
                                                                 ))
        one_hot = lambda token: numpy_one_hot([rep(token)],
                                              n_classes=vocab_len)[0]
        self.train_inputs = ModifyStream(self.train_inputs, one_hot)
        if self.sequence_len:
            self.train_inputs = self._subsequence(self.train_inputs)
        if self.valid_inputs is not None:
            self.valid_inputs = ModifyStream(self.valid_inputs, one_hot)
            if self.sequence_len:
                self.valid_inputs = self._subsequence(self.valid_inputs)
        if self.test_inputs is not None:
            self.test_inputs = ModifyStream(self.test_inputs, one_hot)
            if self.sequence_len:
                self.test_inputs = self._subsequence(self.test_inputs)

        # Now deal with possible output streams (either tokenizing it using the supplied label dictionary,
        # creating the label dictionary, or using the vocab dictionary if it is a language model
        # (target_n_future is not none)
        if self.train_targets is not None and target_n_future is None:
            vocab_inputs = [self.train_targets] + (self.valid_targets or [])
            self.label_vocab = label_vocab or \
                               self.compile_vocab(itertools.chain(*vocab_inputs))
            self.label_vocab_inverse = {
                v: k
                for k, v in self.label_vocab.items()
            }
        # if this is a language model, label vocab is same as input vocab
        elif target_n_future is not None:
            self.label_vocab = self.vocab
            self.label_vocab_inverse = self.vocab_inverse
        else:
            self.label_vocab = None
            self.label_vocab_inverse = None

        # now modify the output streams with the one-hot representation using the vocab (making sure they remain
        # as lists to satisfy the superclass condition)
        if self.label_vocab is not None:
            label_vocab_len = len(self.label_vocab)
            label_rep = lambda token: self.label_vocab.get(
                token, self.label_vocab.get(self.unk_token))
            label_one_hot = lambda token: numpy_one_hot(
                [label_rep(token)], n_classes=label_vocab_len)[0]
            if self.train_targets is not None:
                self.train_targets = ModifyStream(self.train_targets,
                                                  label_one_hot)
                if self.sequence_len:
                    self.train_targets = self._subsequence(self.train_targets)
            if self.valid_targets is not None:
                self.valid_targets = ModifyStream(self.valid_targets,
                                                  label_one_hot)
                if self.sequence_len:
                    self.valid_targets = self._subsequence(self.valid_targets)
            if self.test_targets is not None:
                self.test_targets = ModifyStream(self.test_targets,
                                                 label_one_hot)
                if self.sequence_len:
                    self.test_targets = self._subsequence(self.test_targets)
Example #10
    def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
                 dataset_dir='../../datasets', sequence_number=0, rng=None):
        """
        Parameters
        ----------
        binary : bool, optional
            Flag to binarize the input images.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together. This would be the original split.
        dataset_dir : str, optional
            The `dataset_dir` parameter to a ``FileDataset``.
        sequence_number : int, optional
            The sequence method to use if we want to put the input images into a specific order. 0 defaults to random.
        rng : random, optional
            The random number generator to use when sequencing.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

        filename = 'mnist.pkl.gz'
        source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'

        super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)

        # self.dataset_location now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
                gzip.open(self.dataset_location, 'rb')
            )
        else:
            (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
                open(self.dataset_location, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_X = numpy.concatenate((self.train_X, self.valid_X))
            self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

        # sequence the dataset
        if sequence_number is not None:
            self.sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
            self.train_X = binarize(self.train_X, binary_cutoff)
            self.valid_X = binarize(self.valid_X, binary_cutoff)
            self.test_X  = binarize(self.test_X, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
            self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
            self.test_Y  = numpy_one_hot(self.test_Y, n_classes=10)

        log.debug("loading datasets into shared variables")
        self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
        self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)

        self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
        self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)

        self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
        self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
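
A hedged usage sketch of this MNIST class follows; the keyword arguments mirror the docstring above, and after construction the X/Y attributes hold the shared variables created in the last block:

# Hypothetical call, assuming the dataset file can be downloaded into dataset_dir.
mnist = MNIST(binary=True, binary_cutoff=0.5, one_hot=True, concat_train_valid=True)
# mnist.train_X / mnist.train_Y are now dataset_shared (Theano shared) variables.
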
Example #11
    def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
                 sequence_number=0, seq_3d=False, seq_length=30, rng=None,
                 path=mnist_path,
                 source=mnist_source):
        """
        Parameters
        ----------
        binary : bool, optional
            Flag to binarize the input images.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together. This would be the original split.
        sequence_number : int, optional
            The sequence method to use if we want to put the input images into a specific order. 0 defaults to random.
        seq_3d : bool, optional
            When sequencing, whether the output should be
            3D tensors (batches, subsequences, data) or 2D (sequence, data).
        seq_length : int, optional
            The length of the subsequences to chop the data into when `seq_3d` is True.
        rng : random, optional
            The random number generator to use when sequencing.
        path : str, optional
            The `path` parameter to a ``FileDataset``.
        source : str, optional
            The `source` parameter to a ``FileDataset``.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))

        super(MNIST, self).__init__(path=path, source=source)

        # self.path now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                gzip.open(self.path, 'rb')
            )
        else:
            (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                open(self.path, 'rb')
            )

        if concat_train_valid:
            log.debug('Concatenating train and valid sets together...')
            self.train_inputs = numpy.concatenate((self.train_inputs, self.valid_inputs))
            self.train_targets = numpy.concatenate((self.train_targets, self.valid_targets))

        # sequence the dataset
        if sequence_number is not None:
            self._sequence(sequence_number=sequence_number, rng=rng)

        # make optional binary
        if binary:
            log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
            self.train_inputs = binarize(self.train_inputs, binary_cutoff)
            self.valid_inputs = binarize(self.valid_inputs, binary_cutoff)
            self.test_inputs  = binarize(self.test_inputs, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_targets = numpy_one_hot(self.train_targets, n_classes=10)
            self.valid_targets = numpy_one_hot(self.valid_targets, n_classes=10)
            self.test_targets  = numpy_one_hot(self.test_targets, n_classes=10)

        # optionally make 3D instead of 2D
        if seq_3d:
            log.debug("Making 3D....")
            # chop up into sequences of length seq_length
            # first make sure to chop off the remainder of the data so seq_length can divide evenly.
            length, dim = self.train_inputs.shape
            if self.train_targets.ndim == 1:
                ydim = 1
            else:
                ydim = self.train_targets.shape[-1]
            n_seq = length // seq_length
            self.train_inputs = self.train_inputs[:seq_length * n_seq]
            self.train_targets = self.train_targets[:seq_length * n_seq]
            # now create the 3D tensor of sequences - they will be (num_sequences, seq_length, 784)
            self.train_inputs = numpy.reshape(self.train_inputs, (n_seq, seq_length, dim))
            self.train_targets = numpy.reshape(self.train_targets, (n_seq, seq_length, ydim))

            length, dim = self.valid_inputs.shape
            if self.valid_targets.ndim == 1:
                ydim = 1
            else:
                ydim = self.valid_targets.shape[-1]
            n_seq = length // seq_length
            self.valid_inputs = self.valid_inputs[:seq_length * n_seq]
            self.valid_targets = self.valid_targets[:seq_length * n_seq]
            # now create the 3D tensor of sequences - they will be (num_sequences, seq_length, 784)
            self.valid_inputs = numpy.reshape(self.valid_inputs, (n_seq, seq_length, dim))
            self.valid_targets = numpy.reshape(self.valid_targets, (n_seq, seq_length, ydim))

            length, dim = self.test_inputs.shape
            if self.test_targets.ndim == 1:
                ydim = 1
            else:
                ydim = self.test_targets.shape[-1]
            n_seq = length // seq_length
            self.test_inputs = self.test_inputs[:seq_length * n_seq]
            self.test_targets = self.test_targets[:seq_length * n_seq]
            # now create the 3D tensor of sequences - they will be (num_sequences, seq_length, 784)
            self.test_inputs = numpy.reshape(self.test_inputs, (n_seq, seq_length, dim))
            self.test_targets = numpy.reshape(self.test_targets, (n_seq, seq_length, ydim))

            self._train_shape = self.train_inputs.shape
            self._valid_shape = self.valid_inputs.shape
            self._test_shape = self.test_inputs.shape
            log.debug('Train shape is: %s', str(self._train_shape))
            log.debug('Valid shape is: %s', str(self._valid_shape))
            log.debug('Test shape is: %s', str(self._test_shape))
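
To spell out the seq_3d arithmetic above with the default seq_length=30 (shapes are illustrative, assuming the standard 50,000-example MNIST training split):

# Shape arithmetic only, not library code.
length, dim = 50000, 784
seq_length = 30
n_seq = length // seq_length            # 1666 full subsequences
# inputs:  (50000, 784) -> truncated to (49980, 784) -> reshaped to (1666, 30, 784)
# targets: (50000,)     -> truncated to (49980,)     -> reshaped to (1666, 30, 1)
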
Example #12
    def __init__(self,
                 flatten=False,
                 binary_cutoff=False,
                 one_hot=False,
                 concat_train_valid=False,
                 path=mnist_path,
                 source=mnist_source):
        """
        Parameters
        ----------
        flatten : bool, optional
            Flag to flatten the 2D images into 1D vectors.
        binary_cutoff : float, optional
            If you want to binarize the input images, what threshold value to use.
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        concat_train_valid : bool, optional
            Flag to concatenate the training and validation datasets together. This would be the original split.
        path : str, optional
            The `path` parameter to a ``FileDataset``.
        source : str, optional
            The `source` parameter to a ``FileDataset``.
        """
        # instantiate the Dataset class to install the dataset from the url
        log.info("Loading MNIST with binary={!s} and one_hot={!s}".format(
            str(binary_cutoff), str(one_hot)))

        super(MNIST, self).__init__(path=path, source=source)

        # self.path now contains the os path to the dataset file
        # self.file_type tells how to load the dataset
        # load the dataset into memory
        if self.file_type is file_ops.GZ:
            (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                gzip.open(self.path, 'rb')
            )
        else:
            (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                open(self.path, 'rb')
            )

        if concat_train_valid:
            log.debug("Concatenating train and valid sets together...")
            self.train_inputs = concatenate(
                (self.train_inputs, self.valid_inputs))
            self.train_targets = concatenate(
                (self.train_targets, self.valid_targets))

        # make optional binary
        if binary_cutoff:
            log.debug(
                "Making MNIST input values binary with cutoff {!s}".format(
                    str(binary_cutoff)))
            self.train_inputs = binarize(self.train_inputs, binary_cutoff)
            self.valid_inputs = binarize(self.valid_inputs, binary_cutoff)
            self.test_inputs = binarize(self.test_inputs, binary_cutoff)

        # make optional one-hot labels
        if one_hot:
            self.train_targets = numpy_one_hot(self.train_targets,
                                               n_classes=10)
            self.valid_targets = numpy_one_hot(self.valid_targets,
                                               n_classes=10)
            self.test_targets = numpy_one_hot(self.test_targets, n_classes=10)

        # This data source comes pre-flattened. If not flatten, then expand to (1, 28, 28)
        if not flatten:
            self.train_inputs = reshape(
                self.train_inputs, (self.train_inputs.shape[0], 1, 28, 28))
            self.valid_inputs = reshape(
                self.valid_inputs, (self.valid_inputs.shape[0], 1, 28, 28))
            self.test_inputs = reshape(self.test_inputs,
                                       (self.test_inputs.shape[0], 1, 28, 28))

        log.debug("MNIST train shape: {!s}, {!s}".format(
            self.train_inputs.shape, self.train_targets.shape))
        log.debug("MNIST valid shape: {!s}, {!s}".format(
            self.valid_inputs.shape, self.valid_targets.shape))
        log.debug("MNIST test shape: {!s}, {!s}".format(
            self.test_inputs.shape, self.test_targets.shape))
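
A quick sketch of the un-flatten step above on toy shapes, since the pickled MNIST arrays arrive flattened to 784 features per image:

import numpy

# Illustrative only: four fake flattened images.
flat = numpy.zeros((4, 784), dtype='float32')
images = numpy.reshape(flat, (flat.shape[0], 1, 28, 28))
print(images.shape)  # (4, 1, 28, 28) -> (examples, channels, height, width)
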
Example #13
    def __init__(
            self,
            train_split=0.95,
            valid_split=0.05,
            one_hot=False,
            path='datasets/cifar-10-batches-py/',
            source='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'):
        """
        Parameters
        ----------
        train_split : float
            The percentage of data to be used for training.
        valid_split : float
            The percentage of data to be used for validation.
            (leftover percentage from train and valid splits will be for testing).
        one_hot : bool, optional
            Flag to convert the labels to one-hot encoding rather than their normal integers.
        path : str, optional
            The `path` parameter to a ``FileDataset``.
        source : str, optional
            The `source` parameter to a ``FileDataset``.
        """
        assert (0. < train_split <=
                1.), "Train_split needs to be a fraction between (0, 1]."
        assert (0. <= valid_split <
                1.), "Valid_split needs to be a fraction between [0, 1)."
        assert train_split + valid_split <= 1., "Train_split + valid_split can't be greater than 1."
        # make test_split the leftover percentage!
        test_split = 1 - (train_split + valid_split)

        # instantiate the Dataset class to install the dataset from the url
        log.info('Loading CIFAR-10 with data split (%f, %f, %f)' %
                 (train_split, valid_split, test_split))

        super(CIFAR10, self).__init__(path=path, source=source)

        # extract out all the samples
        # (from keras https://github.com/fchollet/keras/blob/master/keras/datasets/cifar10.py)
        nb_samples = 50000
        X = numpy.zeros((nb_samples, 3, 32, 32), dtype="uint8")
        Y = numpy.zeros((nb_samples, ), dtype="uint8")
        for i in range(1, 6):
            fpath = os.path.join(self.path, 'data_batch_%d' % i)
            with open(fpath, 'rb') as f:
                d = pickle.load(f)
            data = d['data']
            labels = d['labels']

            data = data.reshape(data.shape[0], 3, 32, 32)
            X[(i - 1) * 10000:i * 10000, :, :, :] = data
            Y[(i - 1) * 10000:i * 10000] = labels

        if one_hot:
            Y = numpy_one_hot(Y, n_classes=10)

        length = X.shape[0]

        train_len = int(math.floor(length * train_split))
        valid_len = int(math.floor(length * valid_split))

        # divide into train, valid, and test sets!
        self.train_inputs = X[:train_len]
        self.train_targets = Y[:train_len]

        if valid_split > 0:
            self.valid_inputs = X[train_len:train_len + valid_len]
            self.valid_targets = Y[train_len:train_len + valid_len]
        else:
            self.valid_inputs = None
            self.valid_targets = None

        if test_split > 0:
            self.test_inputs = X[train_len + valid_len:]
            self.test_targets = Y[train_len + valid_len:]
        else:
            self.test_inputs = None
            self.test_targets = None
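
For clarity, the split arithmetic above with the default arguments (values assume the 50,000 stacked CIFAR-10 training samples):

import math

# Illustrative arithmetic only.
nb_samples = 50000
train_split, valid_split = 0.95, 0.05
train_len = int(math.floor(nb_samples * train_split))  # 47500 -> X[:47500]
valid_len = int(math.floor(nb_samples * valid_split))  # 2500  -> X[47500:50000]
# the leftover fraction becomes the test split; here it is 0, so test_inputs is None
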
Example #14
    def __init__(self, source_dir, size_key, one_hot=False, train_split=0.9, valid_split=0.1):
        print("Getting dataset %s" % size_key)
        # grab the datasets from the preprocessed files
        datasets = [(numpy.load(f), get_label(f)) for f in find_processed_files(source_dir, size_key)]
        # make sure they are all the correct dimensionality
        datasets = [(data.shape, data, label) for data, label in datasets
                    if data.shape[1] == sizes[size_key] and label is not None]

        print("Found %d examples" % len(datasets))

        # shuffle!
        random.seed(1)
        random.shuffle(datasets)

        # shapes
        shapes = [shape for shape, _, _ in datasets]
        # data
        dataset = [data for _, data, _ in datasets]
        # labels
        labels = numpy.asarray([label for _, _, label in datasets], dtype='int8')
        # make the labels into one-hot vectors
        if one_hot:
            labels = numpy_one_hot(labels, n_classes=5)

        train_len = int(math.floor(train_split * len(dataset)))
        valid_len = int(math.floor(valid_split * len(dataset)))
        valid_stop = train_len + valid_len
        print("# train: %d examples" % train_len)

        train_datasets = dataset[:train_len]
        train_labels = labels[:train_len]

        if valid_len > 0:
            valid_datasets = dataset[train_len:valid_stop]
            valid_labels = labels[train_len:valid_stop]
        else:
            valid_datasets = []

        if train_len + valid_len < len(dataset):
            test_datasets = dataset[valid_stop:]
            test_labels = labels[valid_stop:]
        else:
            test_datasets = []

        # median_train_len = int(numpy.median(numpy.asarray(shapes[:train_len]), axis=0)[0])
        min_train_len = int(numpy.min(numpy.asarray(shapes[:train_len]), axis=0)[0])
        max_train_len = int(numpy.max(numpy.asarray(shapes[:train_len]), axis=0)[0])
        # train = numpy.array([data[:min_train_len] for data in train_datasets], dtype='float32')
        train = numpy.array(
            [numpy.pad(data, [(0, max_train_len-data.shape[0]), (0, 0)], mode='constant') for data in train_datasets],
            dtype='float32'
        )
        self.train_shape = train.shape
        self.train = (dataset_shared(train, borrow=True), dataset_shared(train_labels, borrow=True))

        if len(valid_datasets) > 0:
            min_valid_len = int(numpy.min(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
            max_valid_len = int(numpy.max(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
            # valid = numpy.array([data[:min_valid_len] for data in valid_datasets], dtype='float32')
            valid = numpy.array(
                [numpy.pad(data, [(0, max_valid_len-data.shape[0]), (0, 0)], mode='constant') for data in
                 valid_datasets],
                dtype='float32'
            )
            self.valid_shape = valid.shape
            self.valid = (dataset_shared(valid, borrow=True), dataset_shared(valid_labels, borrow=True))
        else:
            self.valid = None, None
            self.valid_shape = None

        if len(test_datasets) > 0:
            min_test_len = int(numpy.min(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
            max_test_len = int(numpy.max(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
            # valid = numpy.array([data[:min_test_len] for data in test_datasets], dtype='float32')
            test = numpy.array(
                [numpy.pad(data, [(0, max_test_len - data.shape[0]), (0, 0)], mode='constant') for data in
                 test_datasets],
                dtype='float32'
            )
            self.test_shape = test.shape
            self.test = (dataset_shared(test, borrow=True), dataset_shared(test_labels, borrow=True))
        else:
            self.test = None, None
            self.test_shape = None


        print("Train shape: %s" % str(self.train_shape))
        print("Valid shape: %s" % str(self.valid_shape))
        print("Test shape: %s" % str(self.test_shape))
        print("Dataset %s initialized!" % size_key)