def __init__(self, binary=False, one_hot=False, concat_train_valid=True,
             dataset_dir='../../datasets', sequence_number=0, rng=None):
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))
    filename = 'mnist.pkl.gz'
    source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
    super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)
    # self.dataset_location now contains the os path to the dataset file
    # self.file_type tells how to load the dataset

    # load the dataset into memory
    if PY3:
        pickle_load = partial(pickle.load, encoding='bytes')
    else:
        pickle_load = partial(pickle.load)

    if self.file_type is file_ops.GZ:
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
            gzip.open(self.dataset_location, 'rb')
        )
    else:
        # pickle data is binary, so the file needs to be opened in binary mode
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle_load(
            open(self.dataset_location, 'rb')
        )

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_X = numpy.concatenate((self.train_X, self.valid_X))
        self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

    # sequence the dataset
    if sequence_number is not None:
        self.sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        _binary_cutoff = 0.5
        log.debug('Making MNIST X values binary with cutoff %s', str(_binary_cutoff))
        self.train_X = (self.train_X > _binary_cutoff).astype('float32')
        self.valid_X = (self.valid_X > _binary_cutoff).astype('float32')
        self.test_X = (self.test_X > _binary_cutoff).astype('float32')

    # make optional one-hot labels
    if one_hot:
        self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
        self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
        self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

    log.debug("loading datasets into shared variables")
    self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
    self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)
    self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
    self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)
    self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
    self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
def setUp(self):
    # numpy array to test
    self.np = numpy.eye(10)
    # list of one-hot word vectors to test
    words = "Hello\nWorld\nThis\nIs\nA\nTest!".split("\n")
    vocab = {char: n for char, n in zip(list(set(words)), range(len(set(words))))}
    words = [vocab[x] for x in words]
    self.words = numpy_one_hot(words, n_classes=len(vocab))
def setUp(self):
    # numpy array to test
    self.np = numpy.eye(10)
    # list of one-hot word vectors to test
    words = "Hello\nWorld\nThis\nIs\nA\nTest!".split("\n")
    vocab = {
        char: n
        for char, n in zip(list(set(words)), range(len(set(words))))
    }
    words = [vocab[x] for x in words]
    self.words = numpy_one_hot(words, n_classes=len(vocab))
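# A minimal sketch of the one-hot helper the tests above exercise, assuming behavior
# equivalent to numpy_one_hot (the implementation below is illustrative, not the
# library's own): each integer label becomes a row with a single 1 in its class column.
import numpy

def one_hot_sketch(labels, n_classes):
    out = numpy.zeros((len(labels), n_classes), dtype='float32')
    out[numpy.arange(len(labels)), labels] = 1.0
    return out

# one_hot_sketch([2, 0, 1], n_classes=3) ->
# [[0, 0, 1],
#  [1, 0, 0],
#  [0, 1, 0]]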
def __process_str(data_str, vocab):
    # process the raw input data string
    data = []
    for data_char in data_str:
        data.append(vocab.get(data_char, 0))
    # list() keeps the amax call working on Python 3, where dict.values() is a view
    data = numpy_one_hot(numpy.asarray(data), n_classes=numpy.amax(list(vocab.values())) + 1)
    seq, dim = data.shape
    data = numpy.reshape(data, (1, seq, dim))
    return data
def string_to_data(query):
    vocab = pickle.load(open('vocab.pkl', 'rb'))
    # process the raw input data string
    data = []
    # get the integer encodings
    for data_char in query:
        data.append(vocab.get(data_char, 0))
    # convert the integers to one-hot arrays
    # (list() keeps the amax call working on Python 3, where dict.values() is a view)
    data = numpy_one_hot(numpy.asarray(data), n_classes=numpy.amax(list(vocab.values())) + 1)
    # make 3D for model input
    seq, dim = data.shape
    data = numpy.reshape(data, (1, seq, dim))
    return data
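# A self-contained version of the same string -> (1, seq, dim) pipeline, using a toy
# in-memory vocab instead of vocab.pkl; numpy.eye stands in for numpy_one_hot here and
# is an assumption about its behavior, not the library implementation.
import numpy

toy_vocab = {'h': 1, 'e': 2, 'l': 3, 'o': 4}                  # 0 is reserved for unknown chars
codes = numpy.asarray([toy_vocab.get(c, 0) for c in "hello"])
n_classes = numpy.amax(list(toy_vocab.values())) + 1
one_hot = numpy.eye(n_classes, dtype='float32')[codes]        # (seq, dim)
batched = one_hot.reshape((1,) + one_hot.shape)               # (1, seq, dim) for the model
print(batched.shape)                                          # (1, 5, 5)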
def __init__(self, path, source=None, train_filter=None, valid_filter=None, test_filter=None,
             inputs_preprocess=None, targets_preprocess=None, vocab=None, label_vocab=None,
             unk_token="<UNK>", level="char", target_n_future=None, sequence_length=False):
    """
    Initialize a text-based dataset.

    Parameters
    ----------
    path : str
        The name of the file or directory for the dataset.
    source : str, optional
        The URL path for downloading the dataset (if applicable).
    train_filter : regex string or compiled regex object, optional
        The regular expression filter to match training file names against (if applicable).
    valid_filter : regex string or compiled regex object, optional
        The regular expression filter to match validation file names against (if applicable).
    test_filter : regex string or compiled regex object, optional
        The regular expression filter to match testing file names against (if applicable).
    inputs_preprocess : function, optional
        A preprocessing function to apply to input data. This function will be applied to each line
        from the files in `path`, and if it creates a list of elements, each element will be yielded
        as the input data separately. For example, the function could be
        ``lambda line: (line.split(',')[0]).lower()`` to grab a string before a comma on each line
        and lowercase it. Preprocessing will happen before any tokenization is applied, i.e.
        tokenizing and preprocessing are composed as tokenize(preprocess(line)).
    targets_preprocess : function, optional
        A preprocessing function to apply to targets data. This function will be applied to each line
        from the files in `path`, and if it creates a list of elements, each element will be yielded
        as the target label data separately. For example, the function could be
        ``lambda line: (line.split(',')[1]).lower()`` to grab a label after a comma on each line
        and lowercase it. Tokenization will not be applied to data yielded from the targets'
        preprocessing.
    vocab : dict, optional
        A starting dictionary to use when converting tokens to numbers.
    label_vocab : dict, optional
        A starting dictionary to use when converting labels (targets) to numbers.
    unk_token : str
        The representation for an unknown token to use in the vocab dictionary.
    level : str
        Either ``char``, ``word``, or ``line``, saying how to process the text. For ``char``, data
        will be character-level. For ``word``, data will be split by whitespace. For ``line``, data
        will be split by newline.
    target_n_future : int, optional
        For creating language models that predict tokens in the future, this determines the skip size
        (number of steps in the future) that the language model will try to predict as its target.
        Most language models will have target_n_future=1. If `target_n_future` is not None, the
        targets will be created from the inputs (but still apply targets_preprocess instead of
        inputs_preprocess if it is different).
    sequence_length : int, optional
        The maximum length of subsequences to iterate over this dataset. If this is None or False,
        the data will just be supplied as a stream of one-hot vectors rather than broken into 2-D
        one-hot vector sequences.
    """
    # Figure out if we want characters, words, or lines processed, and create the processing function
    # to compose on top of the preprocessing function arguments.
    level = level.lower()
    if level == "char":
        tokenize = lambda s: list(s)
    elif level == "word":
        if NLTK_AVAILABLE:
            tokenize = lambda s: nltk.tokenize.word_tokenize(s)
        else:
            warnings.warn("NLTK isn't installed - going to split strings by whitespace. Highly recommended "
                          "that you install nltk for better word tokenization.")
            tokenize = lambda s: s.split()
    elif level == "line":
        tokenize = lambda s: [s]
    else:
        tokenize = None

    if sequence_length:
        assert sequence_length > 1, "Need to have a sequence_length greater than 1, found %d" % sequence_length
    self.sequence_len = sequence_length

    # modify our file stream's processors to work with the appropriate level!
    # if target_n_future is not None, we are assuming that this is a language model and that we
    # should tokenize the target
    if target_n_future is not None:
        targets_preprocess = compose(tokenize, inputs_preprocess)
    inputs_preprocess = compose(tokenize, inputs_preprocess)

    # call super to create the data streams
    super(TextDataset, self).__init__(path=path, source=source,
                                      train_filter=train_filter, valid_filter=valid_filter,
                                      test_filter=test_filter,
                                      inputs_preprocess=inputs_preprocess,
                                      targets_preprocess=targets_preprocess)
    # after this call, train_inputs, train_targets, etc. are all lists or None.

    # determine if this is a language model, and adjust the stream accordingly to use the inputs as the targets
    if target_n_future is not None:
        self.train_targets = FileStream(path, train_filter, targets_preprocess, target_n_future)
        if valid_filter is not None:
            self.valid_targets = FileStream(path, valid_filter, targets_preprocess, target_n_future)
        if test_filter is not None:
            self.test_targets = FileStream(path, test_filter, targets_preprocess, target_n_future)

    # Create our vocab dictionary if it doesn't exist!
    self.unk_token = unk_token
    vocab_inputs = [self.train_inputs] + (self.valid_inputs or [])
    self.vocab = vocab or self.compile_vocab(itertools.chain(*vocab_inputs))
    vocab_len = len(self.vocab)
    self.vocab_inverse = {v: k for k, v in self.vocab.items()}

    # Now modify our various inputs streams with one-hot versions using the vocab dictionary
    # (making sure they remain as lists to satisfy the superclass condition).
    rep = lambda token: self.vocab.get(token, self.vocab.get(self.unk_token))
    one_hot = lambda token: numpy_one_hot([rep(token)], n_classes=vocab_len)[0]
    self.train_inputs = ModifyStream(self.train_inputs, one_hot)
    if self.sequence_len:
        self.train_inputs = self._subsequence(self.train_inputs)
    if self.valid_inputs is not None:
        self.valid_inputs = ModifyStream(self.valid_inputs, one_hot)
        if self.sequence_len:
            self.valid_inputs = self._subsequence(self.valid_inputs)
    if self.test_inputs is not None:
        self.test_inputs = ModifyStream(self.test_inputs, one_hot)
        if self.sequence_len:
            # subsequence the test stream here (the original re-subsequenced valid_inputs, which was a bug)
            self.test_inputs = self._subsequence(self.test_inputs)

    # Now deal with possible output streams (either tokenizing it using the supplied label dictionary,
    # creating the label dictionary, or using the vocab dictionary if it is a language model,
    # i.e. target_n_future is not None).
    if self.train_targets is not None and target_n_future is None:
        vocab_inputs = [self.train_targets] + (self.valid_targets or [])
        self.label_vocab = label_vocab or \
            self.compile_vocab(itertools.chain(*vocab_inputs))
        self.label_vocab_inverse = {v: k for k, v in self.label_vocab.items()}
    # if this is a language model, label vocab is same as input vocab
    elif target_n_future is not None:
        self.label_vocab = self.vocab
        self.label_vocab_inverse = self.vocab_inverse
    else:
        self.label_vocab = None
        self.label_vocab_inverse = None

    # now modify the output streams with the one-hot representation using the vocab (making sure they remain
    # as lists to satisfy the superclass condition)
    if self.label_vocab is not None:
        label_vocab_len = len(self.label_vocab)
        label_rep = lambda token: self.label_vocab.get(token, self.label_vocab.get(self.unk_token))
        label_one_hot = lambda token: numpy_one_hot([label_rep(token)], n_classes=label_vocab_len)[0]
        if self.train_targets is not None:
            self.train_targets = ModifyStream(self.train_targets, label_one_hot)
            if self.sequence_len:
                self.train_targets = self._subsequence(self.train_targets)
        if self.valid_targets is not None:
            self.valid_targets = ModifyStream(self.valid_targets, label_one_hot)
            if self.sequence_len:
                self.valid_targets = self._subsequence(self.valid_targets)
        if self.test_targets is not None:
            self.test_targets = ModifyStream(self.test_targets, label_one_hot)
            if self.sequence_len:
                self.test_targets = self._subsequence(self.test_targets)
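# A self-contained sketch of what the constructor above builds, assuming only numpy:
# a token -> id vocab with an <UNK> fallback, its inverse, and a stream of one-hot
# vectors. The names below are illustrative, not the library's internals.
import numpy

tokens = list("hello world")          # character-level, as with level="char"
unk_token = "<UNK>"
vocab = {unk_token: 0}
for t in tokens:
    vocab.setdefault(t, len(vocab))
vocab_inverse = {v: k for k, v in vocab.items()}

rep = lambda token: vocab.get(token, vocab[unk_token])
one_hot = lambda token: numpy.eye(len(vocab), dtype='float32')[rep(token)]

stream = [one_hot(t) for t in tokens + ["?"]]   # "?" is unseen, so it maps to <UNK>
print(len(stream), stream[0].shape)             # 12 (9,)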
def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
             dataset_dir='../../datasets', sequence_number=0, rng=None):
    """
    Parameters
    ----------
    binary : bool, optional
        Flag to binarize the input images.
    binary_cutoff : float, optional
        If you want to binarize the input images, what threshold value to use.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together. This would be the original split.
    dataset_dir : str, optional
        The `dataset_dir` parameter to a ``FileDataset``.
    sequence_number : int, optional
        The sequence method to use if we want to put the input images into a specific order.
        0 defaults to random.
    rng : random, optional
        The random number generator to use when sequencing.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))
    filename = 'mnist.pkl.gz'
    source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
    super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)
    # self.dataset_location now contains the os path to the dataset file
    # self.file_type tells how to load the dataset

    # load the dataset into memory
    if self.file_type is file_ops.GZ:
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
            gzip.open(self.dataset_location, 'rb'))
    else:
        # pickle data is binary, so the file needs to be opened in binary mode
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
            open(self.dataset_location, 'rb'))

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_X = numpy.concatenate((self.train_X, self.valid_X))
        self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

    # sequence the dataset
    if sequence_number is not None:
        self.sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
        self.train_X = binarize(self.train_X, binary_cutoff)
        self.valid_X = binarize(self.valid_X, binary_cutoff)
        self.test_X = binarize(self.test_X, binary_cutoff)

    # make optional one-hot labels
    if one_hot:
        self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
        self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
        self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

    log.debug("loading datasets into shared variables")
    self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
    self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)
    self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
    self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)
    self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
    self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
             sequence_number=0, seq_3d=False, seq_length=30, rng=None,
             path=mnist_path, source=mnist_source):
    """
    Parameters
    ----------
    binary : bool, optional
        Flag to binarize the input images.
    binary_cutoff : float, optional
        If you want to binarize the input images, what threshold value to use.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together. This would be the original split.
    sequence_number : int, optional
        The sequence method to use if we want to put the input images into a specific order.
        0 defaults to random.
    seq_3d : bool, optional
        When sequencing, whether the output should be 3D tensors (batches, subsequences, data)
        or 2D (sequence, data).
    seq_length : int, optional
        The subsequence length to use when making 3D tensors.
    rng : random, optional
        The random number generator to use when sequencing.
    path : str, optional
        The `path` parameter to a ``FileDataset``.
    source : str, optional
        The `source` parameter to a ``FileDataset``.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))
    super(MNIST, self).__init__(path=path, source=source)
    # self.path now contains the os path to the dataset file
    # self.file_type tells how to load the dataset

    # load the dataset into memory
    if self.file_type is file_ops.GZ:
        (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                gzip.open(self.path, 'rb')
            )
    else:
        # pickle data is binary, so the file needs to be opened in binary mode
        (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                open(self.path, 'rb')
            )

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_inputs = numpy.concatenate((self.train_inputs, self.valid_inputs))
        self.train_targets = numpy.concatenate((self.train_targets, self.valid_targets))

    # sequence the dataset
    if sequence_number is not None:
        self._sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
        self.train_inputs = binarize(self.train_inputs, binary_cutoff)
        self.valid_inputs = binarize(self.valid_inputs, binary_cutoff)
        self.test_inputs = binarize(self.test_inputs, binary_cutoff)

    # make optional one-hot labels
    if one_hot:
        self.train_targets = numpy_one_hot(self.train_targets, n_classes=10)
        self.valid_targets = numpy_one_hot(self.valid_targets, n_classes=10)
        self.test_targets = numpy_one_hot(self.test_targets, n_classes=10)

    # optionally make 3D instead of 2D
    if seq_3d:
        log.debug("Making 3D....")
        # chop up into sequences of length seq_length
        # first chop off the remainder of the data so seq_length can divide evenly,
        # and use integer division so the reshape dimensions stay ints.
        length, dim = self.train_inputs.shape
        ydim = 1 if self.train_targets.ndim == 1 else self.train_targets.shape[-1]
        if length % seq_length != 0:
            length = seq_length * (length // seq_length)
            self.train_inputs = self.train_inputs[:length]
            self.train_targets = self.train_targets[:length]
        # now create the 3D tensor of sequences - they will be (num_sequences, sequence_size, 784)
        self.train_inputs = numpy.reshape(self.train_inputs, (length // seq_length, seq_length, dim))
        self.train_targets = numpy.reshape(self.train_targets, (length // seq_length, seq_length, ydim))

        length, dim = self.valid_inputs.shape
        ydim = 1 if self.valid_targets.ndim == 1 else self.valid_targets.shape[-1]
        if length % seq_length != 0:
            length = seq_length * (length // seq_length)
            self.valid_inputs = self.valid_inputs[:length]
            self.valid_targets = self.valid_targets[:length]
        self.valid_inputs = numpy.reshape(self.valid_inputs, (length // seq_length, seq_length, dim))
        self.valid_targets = numpy.reshape(self.valid_targets, (length // seq_length, seq_length, ydim))

        length, dim = self.test_inputs.shape
        ydim = 1 if self.test_targets.ndim == 1 else self.test_targets.shape[-1]
        if length % seq_length != 0:
            length = seq_length * (length // seq_length)
            self.test_inputs = self.test_inputs[:length]
            self.test_targets = self.test_targets[:length]
        self.test_inputs = numpy.reshape(self.test_inputs, (length // seq_length, seq_length, dim))
        self.test_targets = numpy.reshape(self.test_targets, (length // seq_length, seq_length, ydim))

    self._train_shape = self.train_inputs.shape
    self._valid_shape = self.valid_inputs.shape
    self._test_shape = self.test_inputs.shape
    log.debug('Train shape is: %s', str(self._train_shape))
    log.debug('Valid shape is: %s', str(self._valid_shape))
    log.debug('Test shape is: %s', str(self._test_shape))
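# A toy illustration of the chop-and-reshape step above, assuming nothing beyond numpy:
# trim the leading dimension so seq_length divides it evenly, then fold the stream of
# rows into (num_sequences, seq_length, dim) subsequences.
import numpy

seq_length = 30
inputs = numpy.random.rand(100, 784).astype('float32')   # e.g. 100 flattened MNIST images
usable = (inputs.shape[0] // seq_length) * seq_length     # 90
inputs_3d = inputs[:usable].reshape(usable // seq_length, seq_length, -1)
print(inputs_3d.shape)                                    # (3, 30, 784)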
def __init__(self, path, source=None, train_filter=None, valid_filter=None, test_filter=None,
             inputs_preprocess=None, targets_preprocess=None, vocab=None, label_vocab=None,
             unk_token="<UNK>", level="char", target_n_future=None, sequence_length=False):
    """
    Initialize a text-based dataset. It will output one-hot vector encodings for the appropriate
    level (word, char, line).

    Parameters
    ----------
    path : str
        The name of the file or directory for the dataset.
    source : str, optional
        The URL path for downloading the dataset (if applicable).
    train_filter : regex string or compiled regex object, optional
        The regular expression filter to match training file names against (if applicable).
    valid_filter : regex string or compiled regex object, optional
        The regular expression filter to match validation file names against (if applicable).
    test_filter : regex string or compiled regex object, optional
        The regular expression filter to match testing file names against (if applicable).
    inputs_preprocess : function, optional
        A preprocessing function to apply to input data. This function will be applied to each line
        from the files in `path`, and if it creates a list of elements, each element will be yielded
        as the input data separately. For example, the function could be
        ``lambda line: (line.split(',')[0]).lower()`` to grab a string before a comma on each line
        and lowercase it. Preprocessing will happen before any tokenization is applied, i.e.
        tokenizing and preprocessing are composed as tokenize(preprocess(line)).
    targets_preprocess : function, optional
        A preprocessing function to apply to targets data. This function will be applied to each line
        from the files in `path`, and if it creates a list of elements, each element will be yielded
        as the target label data separately. For example, the function could be
        ``lambda line: (line.split(',')[1]).lower()`` to grab a label after a comma on each line
        and lowercase it. Tokenization will not be applied to data yielded from the targets'
        preprocessing.
    vocab : dict, optional
        A starting dictionary to use when converting tokens to numbers.
    label_vocab : dict, optional
        A starting dictionary to use when converting labels (targets) to numbers.
    unk_token : str
        The representation for an unknown token to use in the vocab dictionary.
    level : str
        Either ``char``, ``word``, or ``line``, saying how to process the text. For ``char``, data
        will be character-level. For ``word``, data will be split by whitespace. For ``line``, data
        will be split by newline.
    target_n_future : int, optional
        For creating language models that predict tokens in the future, this determines the skip size
        (number of steps in the future) that the language model will try to predict as its target.
        Most language models will have target_n_future=1. If `target_n_future` is not None, the
        targets will be created from the inputs (but still apply targets_preprocess instead of
        inputs_preprocess if it is different).
    sequence_length : int, optional
        The maximum length of subsequences to iterate over this dataset. If this is None or False,
        the data will just be supplied as a stream of one-hot vectors rather than broken into 2-D
        one-hot vector sequences.
    """
    # Figure out if we want characters, words, or lines processed, and create the processing function
    # to compose on top of the preprocessing function arguments.
    level = level.lower()
    if level == "char":
        tokenize = lambda s: list(s)
    elif level == "word":
        if NLTK_AVAILABLE:
            tokenize = lambda s: nltk.tokenize.word_tokenize(s)
        else:
            warnings.warn(
                "NLTK isn't installed - going to split strings by whitespace. Highly recommended "
                "that you install nltk for better word tokenization.")
            tokenize = lambda s: s.split()
    elif level == "line":
        tokenize = lambda s: [s]
    else:
        tokenize = None

    if sequence_length:
        assert sequence_length > 1, "Need to have a sequence_length greater than 1, found %d" % sequence_length
    self.sequence_len = sequence_length

    # modify our file stream's processors to work with the appropriate level!
    # if target_n_future is not None, we are assuming that this is a language model and that we
    # should tokenize the target
    if target_n_future is not None:
        targets_preprocess = compose(tokenize, inputs_preprocess)
    inputs_preprocess = compose(tokenize, inputs_preprocess)

    # call super to create the data streams
    super(TextDataset, self).__init__(path=path, source=source,
                                      train_filter=train_filter, valid_filter=valid_filter,
                                      test_filter=test_filter,
                                      inputs_preprocess=inputs_preprocess,
                                      targets_preprocess=targets_preprocess)
    # after this call, train_inputs, train_targets, etc. are all lists or None.

    # determine if this is a language model, and adjust the stream accordingly to use the inputs as the targets
    if target_n_future is not None:
        self.train_targets = FileStream(path, train_filter, targets_preprocess, target_n_future)
        if valid_filter is not None:
            self.valid_targets = FileStream(path, valid_filter, targets_preprocess, target_n_future)
        if test_filter is not None:
            self.test_targets = FileStream(path, test_filter, targets_preprocess, target_n_future)

    # Create our vocab dictionary if it doesn't exist!
    self.unk_token = unk_token
    vocab_inputs = [self.train_inputs] + (self.valid_inputs or [])
    self.vocab = vocab or self.compile_vocab(itertools.chain(*vocab_inputs))
    vocab_len = len(self.vocab)
    self.vocab_inverse = {v: k for k, v in self.vocab.items()}

    # Now modify our various inputs streams with one-hot versions using the vocab dictionary
    # (making sure they remain as lists to satisfy the superclass condition).
    rep = lambda token: self.vocab.get(token, self.vocab.get(self.unk_token))
    one_hot = lambda token: numpy_one_hot([rep(token)], n_classes=vocab_len)[0]
    self.train_inputs = ModifyStream(self.train_inputs, one_hot)
    if self.sequence_len:
        self.train_inputs = self._subsequence(self.train_inputs)
    if self.valid_inputs is not None:
        self.valid_inputs = ModifyStream(self.valid_inputs, one_hot)
        if self.sequence_len:
            self.valid_inputs = self._subsequence(self.valid_inputs)
    if self.test_inputs is not None:
        self.test_inputs = ModifyStream(self.test_inputs, one_hot)
        if self.sequence_len:
            # subsequence the test stream here (the original re-subsequenced valid_inputs, which was a bug)
            self.test_inputs = self._subsequence(self.test_inputs)

    # Now deal with possible output streams (either tokenizing it using the supplied label dictionary,
    # creating the label dictionary, or using the vocab dictionary if it is a language model,
    # i.e. target_n_future is not None).
    if self.train_targets is not None and target_n_future is None:
        vocab_inputs = [self.train_targets] + (self.valid_targets or [])
        self.label_vocab = label_vocab or \
            self.compile_vocab(itertools.chain(*vocab_inputs))
        self.label_vocab_inverse = {v: k for k, v in self.label_vocab.items()}
    # if this is a language model, label vocab is same as input vocab
    elif target_n_future is not None:
        self.label_vocab = self.vocab
        self.label_vocab_inverse = self.vocab_inverse
    else:
        self.label_vocab = None
        self.label_vocab_inverse = None

    # now modify the output streams with the one-hot representation using the vocab (making sure they remain
    # as lists to satisfy the superclass condition)
    if self.label_vocab is not None:
        label_vocab_len = len(self.label_vocab)
        label_rep = lambda token: self.label_vocab.get(token, self.label_vocab.get(self.unk_token))
        label_one_hot = lambda token: numpy_one_hot([label_rep(token)], n_classes=label_vocab_len)[0]
        if self.train_targets is not None:
            self.train_targets = ModifyStream(self.train_targets, label_one_hot)
            if self.sequence_len:
                self.train_targets = self._subsequence(self.train_targets)
        if self.valid_targets is not None:
            self.valid_targets = ModifyStream(self.valid_targets, label_one_hot)
            if self.sequence_len:
                self.valid_targets = self._subsequence(self.valid_targets)
        if self.test_targets is not None:
            self.test_targets = ModifyStream(self.test_targets, label_one_hot)
            if self.sequence_len:
                self.test_targets = self._subsequence(self.test_targets)
def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
             dataset_dir='../../datasets', sequence_number=0, rng=None):
    """
    Parameters
    ----------
    binary : bool, optional
        Flag to binarize the input images.
    binary_cutoff : float, optional
        If you want to binarize the input images, what threshold value to use.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together. This would be the original split.
    dataset_dir : str, optional
        The `dataset_dir` parameter to a ``FileDataset``.
    sequence_number : int, optional
        The sequence method to use if we want to put the input images into a specific order.
        0 defaults to random.
    rng : random, optional
        The random number generator to use when sequencing.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))
    filename = 'mnist.pkl.gz'
    source = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
    super(MNIST, self).__init__(filename=filename, source=source, dataset_dir=dataset_dir)
    # self.dataset_location now contains the os path to the dataset file
    # self.file_type tells how to load the dataset

    # load the dataset into memory
    if self.file_type is file_ops.GZ:
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
            gzip.open(self.dataset_location, 'rb')
        )
    else:
        # pickle data is binary, so the file needs to be opened in binary mode
        (self.train_X, self.train_Y), (self.valid_X, self.valid_Y), (self.test_X, self.test_Y) = pickle.load(
            open(self.dataset_location, 'rb')
        )

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_X = numpy.concatenate((self.train_X, self.valid_X))
        self.train_Y = numpy.concatenate((self.train_Y, self.valid_Y))

    # sequence the dataset
    if sequence_number is not None:
        self.sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
        self.train_X = binarize(self.train_X, binary_cutoff)
        self.valid_X = binarize(self.valid_X, binary_cutoff)
        self.test_X = binarize(self.test_X, binary_cutoff)

    # make optional one-hot labels
    if one_hot:
        self.train_Y = numpy_one_hot(self.train_Y, n_classes=10)
        self.valid_Y = numpy_one_hot(self.valid_Y, n_classes=10)
        self.test_Y = numpy_one_hot(self.test_Y, n_classes=10)

    log.debug("loading datasets into shared variables")
    self.train_X = dataset_shared(self.train_X, name='mnist_train_x', borrow=True)
    self.train_Y = dataset_shared(self.train_Y, name='mnist_train_y', borrow=True)
    self.valid_X = dataset_shared(self.valid_X, name='mnist_valid_x', borrow=True)
    self.valid_Y = dataset_shared(self.valid_Y, name='mnist_valid_y', borrow=True)
    self.test_X = dataset_shared(self.test_X, name='mnist_test_x', borrow=True)
    self.test_Y = dataset_shared(self.test_Y, name='mnist_test_y', borrow=True)
def __init__(self, binary=False, binary_cutoff=0.5, one_hot=False, concat_train_valid=False,
             sequence_number=0, seq_3d=False, seq_length=30, rng=None,
             path=mnist_path, source=mnist_source):
    """
    Parameters
    ----------
    binary : bool, optional
        Flag to binarize the input images.
    binary_cutoff : float, optional
        If you want to binarize the input images, what threshold value to use.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together. This would be the original split.
    sequence_number : int, optional
        The sequence method to use if we want to put the input images into a specific order.
        0 defaults to random.
    seq_3d : bool, optional
        When sequencing, whether the output should be 3D tensors (batches, subsequences, data)
        or 2D (sequence, data).
    seq_length : int, optional
        The subsequence length to use when making 3D tensors.
    rng : random, optional
        The random number generator to use when sequencing.
    path : str, optional
        The `path` parameter to a ``FileDataset``.
    source : str, optional
        The `source` parameter to a ``FileDataset``.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading MNIST with binary=%s and one_hot=%s', str(binary), str(one_hot))
    super(MNIST, self).__init__(path=path, source=source)
    # self.path now contains the os path to the dataset file
    # self.file_type tells how to load the dataset

    # load the dataset into memory
    if self.file_type is file_ops.GZ:
        (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                gzip.open(self.path, 'rb')
            )
    else:
        # pickle data is binary, so the file needs to be opened in binary mode
        (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                open(self.path, 'rb')
            )

    if concat_train_valid:
        log.debug('Concatenating train and valid sets together...')
        self.train_inputs = numpy.concatenate((self.train_inputs, self.valid_inputs))
        self.train_targets = numpy.concatenate((self.train_targets, self.valid_targets))

    # sequence the dataset
    if sequence_number is not None:
        self._sequence(sequence_number=sequence_number, rng=rng)

    # make optional binary
    if binary:
        log.debug('Making MNIST X values binary with cutoff %s', str(binary_cutoff))
        self.train_inputs = binarize(self.train_inputs, binary_cutoff)
        self.valid_inputs = binarize(self.valid_inputs, binary_cutoff)
        self.test_inputs = binarize(self.test_inputs, binary_cutoff)

    # make optional one-hot labels
    if one_hot:
        self.train_targets = numpy_one_hot(self.train_targets, n_classes=10)
        self.valid_targets = numpy_one_hot(self.valid_targets, n_classes=10)
        self.test_targets = numpy_one_hot(self.test_targets, n_classes=10)

    # optionally make 3D instead of 2D
    if seq_3d:
        log.debug("Making 3D....")
        # chop up into sequences of length seq_length
        # first chop off the remainder of the data so seq_length can divide evenly,
        # and use integer division so the reshape dimensions stay ints.
        length, dim = self.train_inputs.shape
        ydim = 1 if self.train_targets.ndim == 1 else self.train_targets.shape[-1]
        if length % seq_length != 0:
            length = seq_length * (length // seq_length)
            self.train_inputs = self.train_inputs[:length]
            self.train_targets = self.train_targets[:length]
        # now create the 3D tensor of sequences - they will be (num_sequences, sequence_size, 784)
        self.train_inputs = numpy.reshape(self.train_inputs, (length // seq_length, seq_length, dim))
        self.train_targets = numpy.reshape(self.train_targets, (length // seq_length, seq_length, ydim))

        length, dim = self.valid_inputs.shape
        ydim = 1 if self.valid_targets.ndim == 1 else self.valid_targets.shape[-1]
        if length % seq_length != 0:
            length = seq_length * (length // seq_length)
            self.valid_inputs = self.valid_inputs[:length]
            self.valid_targets = self.valid_targets[:length]
        self.valid_inputs = numpy.reshape(self.valid_inputs, (length // seq_length, seq_length, dim))
        self.valid_targets = numpy.reshape(self.valid_targets, (length // seq_length, seq_length, ydim))

        length, dim = self.test_inputs.shape
        ydim = 1 if self.test_targets.ndim == 1 else self.test_targets.shape[-1]
        if length % seq_length != 0:
            length = seq_length * (length // seq_length)
            self.test_inputs = self.test_inputs[:length]
            self.test_targets = self.test_targets[:length]
        self.test_inputs = numpy.reshape(self.test_inputs, (length // seq_length, seq_length, dim))
        self.test_targets = numpy.reshape(self.test_targets, (length // seq_length, seq_length, ydim))

    self._train_shape = self.train_inputs.shape
    self._valid_shape = self.valid_inputs.shape
    self._test_shape = self.test_inputs.shape
    log.debug('Train shape is: %s', str(self._train_shape))
    log.debug('Valid shape is: %s', str(self._valid_shape))
    log.debug('Test shape is: %s', str(self._test_shape))
def __init__(self, flatten=False, binary_cutoff=False, one_hot=False, concat_train_valid=False,
             path=mnist_path, source=mnist_source):
    """
    Parameters
    ----------
    flatten : bool, optional
        Flag to flatten the 2D images into 1D vectors.
    binary_cutoff : float, optional
        If you want to binarize the input images, what threshold value to use.
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    concat_train_valid : bool, optional
        Flag to concatenate the training and validation datasets together. This would be the original split.
    path : str, optional
        The `path` parameter to a ``FileDataset``.
    source : str, optional
        The `source` parameter to a ``FileDataset``.
    """
    # instantiate the Dataset class to install the dataset from the url
    log.info("Loading MNIST with binary={!s} and one_hot={!s}".format(str(binary_cutoff), str(one_hot)))
    super(MNIST, self).__init__(path=path, source=source)
    # self.path now contains the os path to the dataset file
    # self.file_type tells how to load the dataset

    # load the dataset into memory
    if self.file_type is file_ops.GZ:
        (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                gzip.open(self.path, 'rb')
            )
    else:
        # pickle data is binary, so the file needs to be opened in binary mode
        (self.train_inputs, self.train_targets), \
            (self.valid_inputs, self.valid_targets), \
            (self.test_inputs, self.test_targets) = pickle.load(
                open(self.path, 'rb')
            )

    if concat_train_valid:
        log.debug("Concatenating train and valid sets together...")
        self.train_inputs = concatenate((self.train_inputs, self.valid_inputs))
        self.train_targets = concatenate((self.train_targets, self.valid_targets))

    # make optional binary
    if binary_cutoff:
        log.debug("Making MNIST input values binary with cutoff {!s}".format(str(binary_cutoff)))
        self.train_inputs = binarize(self.train_inputs, binary_cutoff)
        self.valid_inputs = binarize(self.valid_inputs, binary_cutoff)
        self.test_inputs = binarize(self.test_inputs, binary_cutoff)

    # make optional one-hot labels
    if one_hot:
        self.train_targets = numpy_one_hot(self.train_targets, n_classes=10)
        self.valid_targets = numpy_one_hot(self.valid_targets, n_classes=10)
        self.test_targets = numpy_one_hot(self.test_targets, n_classes=10)

    # This data source comes pre-flattened. If not flatten, then expand to (1, 28, 28).
    if not flatten:
        self.train_inputs = reshape(self.train_inputs, (self.train_inputs.shape[0], 1, 28, 28))
        self.valid_inputs = reshape(self.valid_inputs, (self.valid_inputs.shape[0], 1, 28, 28))
        self.test_inputs = reshape(self.test_inputs, (self.test_inputs.shape[0], 1, 28, 28))

    log.debug("MNIST train shape: {!s}, {!s}".format(self.train_inputs.shape, self.train_targets.shape))
    log.debug("MNIST valid shape: {!s}, {!s}".format(self.valid_inputs.shape, self.valid_targets.shape))
    log.debug("MNIST test shape: {!s}, {!s}".format(self.test_inputs.shape, self.test_targets.shape))
def __init__(self, train_split=0.95, valid_split=0.05, one_hot=False,
             path='datasets/cifar-10-batches-py/',
             source='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz'):
    """
    Parameters
    ----------
    train_split : float
        The percentage of data to be used for training.
    valid_split : float
        The percentage of data to be used for validation.
        (The leftover percentage from the train and valid splits will be used for testing.)
    one_hot : bool, optional
        Flag to convert the labels to one-hot encoding rather than their normal integers.
    path : str, optional
        The `path` parameter to a ``FileDataset``.
    source : str, optional
        The `source` parameter to a ``FileDataset``.
    """
    assert (0. < train_split <= 1.), "Train_split needs to be a fraction between (0, 1]."
    assert (0. <= valid_split < 1.), "Valid_split needs to be a fraction between [0, 1)."
    assert train_split + valid_split <= 1., "Train_split + valid_split can't be greater than 1."
    # make test_split the leftover percentage!
    test_split = 1 - (train_split + valid_split)

    # instantiate the Dataset class to install the dataset from the url
    log.info('Loading CIFAR-10 with data split (%f, %f, %f)' % (train_split, valid_split, test_split))
    super(CIFAR10, self).__init__(path=path, source=source)

    # extract out all the samples
    # (from keras https://github.com/fchollet/keras/blob/master/keras/datasets/cifar10.py)
    nb_samples = 50000
    X = numpy.zeros((nb_samples, 3, 32, 32), dtype="uint8")
    Y = numpy.zeros((nb_samples,), dtype="uint8")
    for i in range(1, 6):
        fpath = os.path.join(self.path, 'data_batch_%d' % i)
        with open(fpath, 'rb') as f:
            d = pickle.load(f)
        data = d['data']
        labels = d['labels']
        data = data.reshape(data.shape[0], 3, 32, 32)
        X[(i - 1) * 10000:i * 10000, :, :, :] = data
        Y[(i - 1) * 10000:i * 10000] = labels

    if one_hot:
        Y = numpy_one_hot(Y, n_classes=10)

    length = X.shape[0]
    train_len = int(math.floor(length * train_split))
    valid_len = int(math.floor(length * valid_split))

    # divide into train, valid, and test sets!
    self.train_inputs = X[:train_len]
    self.train_targets = Y[:train_len]

    if valid_split > 0:
        self.valid_inputs = X[train_len:train_len + valid_len]
        self.valid_targets = Y[train_len:train_len + valid_len]
    else:
        self.valid_inputs = None
        self.valid_targets = None

    if test_split > 0:
        self.test_inputs = X[train_len + valid_len:]
        self.test_targets = Y[train_len + valid_len:]
    else:
        self.test_inputs = None
        self.test_targets = None
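# A small, standalone sketch of the split arithmetic used above: the test fraction is
# whatever remains after train and valid, and slice boundaries are floor(length * fraction).
import math

length = 50000
train_split, valid_split = 0.95, 0.05
test_split = 1 - (train_split + valid_split)                   # leftover goes to the test set
train_len = int(math.floor(length * train_split))              # 47500
valid_len = int(math.floor(length * valid_split))               # 2500
print(train_len, valid_len, length - train_len - valid_len)    # 47500 2500 0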
def __init__(self, source_dir, size_key, one_hot=False, train_split=0.9, valid_split=0.1):
    print("Getting dataset %s" % size_key)
    # grab the datasets from the preprocessed files
    datasets = [(numpy.load(f), get_label(f)) for f in find_processed_files(source_dir, size_key)]
    # make sure they are all the correct dimensionality
    datasets = [(data.shape, data, label) for data, label in datasets
                if data.shape[1] == sizes[size_key] and label is not None]
    print("Found %d examples" % len(datasets))

    # shuffle!
    random.seed(1)
    random.shuffle(datasets)

    # shapes
    shapes = [shape for shape, _, _ in datasets]
    # data
    dataset = [data for _, data, _ in datasets]
    # labels
    labels = numpy.asarray([label for _, _, label in datasets], dtype='int8')
    # make the labels into one-hot vectors
    if one_hot:
        labels = numpy_one_hot(labels, n_classes=5)

    train_len = int(math.floor(train_split * len(dataset)))
    valid_len = int(math.floor(valid_split * len(dataset)))
    valid_stop = train_len + valid_len
    print("# train: %d examples" % train_len)

    train_datasets = dataset[:train_len]
    train_labels = labels[:train_len]
    if valid_len > 0:
        valid_datasets = dataset[train_len:valid_stop]
        valid_labels = labels[train_len:valid_stop]
    else:
        valid_datasets = []
    if train_len + valid_len < len(dataset):
        test_datasets = dataset[valid_stop:]
        test_labels = labels[valid_stop:]
    else:
        test_datasets = []

    # median_train_len = int(numpy.median(numpy.asarray(shapes[:train_len]), axis=0)[0])
    min_train_len = int(numpy.min(numpy.asarray(shapes[:train_len]), axis=0)[0])
    max_train_len = int(numpy.max(numpy.asarray(shapes[:train_len]), axis=0)[0])
    # train = numpy.array([data[:min_train_len] for data in train_datasets], dtype='float32')
    train = numpy.array(
        [numpy.pad(data, [(0, max_train_len - data.shape[0]), (0, 0)], mode='constant')
         for data in train_datasets],
        dtype='float32'
    )
    self.train_shape = train.shape
    self.train = (dataset_shared(train, borrow=True), dataset_shared(train_labels, borrow=True))

    if len(valid_datasets) > 0:
        min_valid_len = int(numpy.min(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
        max_valid_len = int(numpy.max(numpy.asarray(shapes[train_len:valid_stop]), axis=0)[0])
        # valid = numpy.array([data[:min_valid_len] for data in valid_datasets], dtype='float32')
        valid = numpy.array(
            [numpy.pad(data, [(0, max_valid_len - data.shape[0]), (0, 0)], mode='constant')
             for data in valid_datasets],
            dtype='float32'
        )
        self.valid_shape = valid.shape
        self.valid = (dataset_shared(valid, borrow=True), dataset_shared(valid_labels, borrow=True))
    else:
        self.valid = None, None
        self.valid_shape = None

    if len(test_datasets) > 0:
        min_test_len = int(numpy.min(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
        max_test_len = int(numpy.max(numpy.asarray(shapes[valid_stop:]), axis=0)[0])
        # test = numpy.array([data[:min_test_len] for data in test_datasets], dtype='float32')
        test = numpy.array(
            [numpy.pad(data, [(0, max_test_len - data.shape[0]), (0, 0)], mode='constant')
             for data in test_datasets],
            dtype='float32'
        )
        self.test_shape = test.shape
        self.test = (dataset_shared(test, borrow=True), dataset_shared(test_labels, borrow=True))
    else:
        self.test = None, None
        self.test_shape = None

    print("Train shape: %s" % str(self.train_shape))
    print("Valid shape: %s" % str(self.valid_shape))
    print("Test shape: %s" % str(self.test_shape))
    print("Dataset %s initialized!" % size_key)
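# A minimal sketch of the zero-padding applied above, under the assumption that each
# example is a (time, features) array of varying length: pad the end of the time axis
# out to the longest example in the split before stacking everything into one tensor.
import numpy

examples = [numpy.ones((3, 4)), numpy.ones((5, 4))]
max_len = max(e.shape[0] for e in examples)
padded = numpy.array(
    [numpy.pad(e, [(0, max_len - e.shape[0]), (0, 0)], mode='constant') for e in examples],
    dtype='float32'
)
print(padded.shape)   # (2, 5, 4)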