def __getitem__(self, index): """ Getter method to access the dataset and return a sample. :param index: index of the sample to return. :type index: int :return: ``DataDict({'images','targets', 'targets_label'})``, with: - images: Image, resized if indicated in ``params``, - targets: Index of the target class - targets_label: Label of the target class (cf ``self.labels``) """ img, target = self.dataset.__getitem__(index) target = torch.tensor(target) label = self.labels[target.data] data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['images'] = img data_dict['targets'] = target data_dict['targets_label'] = label return data_dict
def __getitem__(self, index): """ Getter method to access the dataset and return a sample. :param index: index of the sample to return. :type index: int :return: ``DataDict({'images','targets', 'targets_label'})``, with: - images: Image, - mask, - targets: Index of the target class - targets_label: Label of the target class (cf ``self.labels``) """ # get sample img, target = self.dataset.__getitem__(index) # get label label = self.labels[target.data] # create mask mask = torch.IntTensor(self.num_rows, 1).zero_() mask[-1, 0] = 1 data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['images'] = img.view(28, 1, 1, 28) data_dict['mask'] = mask data_dict['targets'] = target.expand((28, 1)) data_dict['targets_label'] = label return data_dict
def __getitem__(self, index): """ Getter method to access the dataset and return a sample. :param index: index of the sample to return. :type index: int :return: ``DataDict({'images','targets', 'targets_label'})``, with: - images: Image, upscaled if ``self.up_scaling`` and pad if ``self.padding``, - targets: Index of the target class - targets_label: Label of the target class (cf ``self.labels``) """ img, target = self.dataset.__getitem__(index) target = torch.tensor(target) # pad img img = F.pad(input=img, pad=self.padding, mode='constant', value=0) label = self.labels[target.data] data_dict = DataDict({key: None for key in self.data_definitions.keys()}) data_dict['images'] = img data_dict['targets'] = target data_dict['targets_label'] = label return data_dict
def __getitem__(self, index): """ Getter method to access the dataset and return a sample. :param index: index of the sample to return. :type index: int :return: ``DataDict({'sequences','targets', 'targets_label'})``, with: - sequences: sequences of pixel, - mask - targets: Index of the target class """ # get sample img, target = self.dataset.__getitem__(index) # get label label = self.labels[target.data] # create mask mask = torch.zeros(self.num_rows * self.num_columns).type( self.app_state.IntTensor) mask[-1] = 1 data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = img data_dict['mask'] = mask data_dict['targets'] = target data_dict['targets_label'] = label return data_dict
def __getitem__(self, index): """ Getter method to access the dataset and return a sample. :param index: index of the sample to return. :type index: int :return: ``DataDict({'images', 'mask', 'targets', 'targets_label'})``, with: - images: sequence of 'images' in [batch size, sequence length, channels, x, y] format. Single pixels, so x == y == 1 - mask - targets: Index of the target class """ # get sample img, target = self.dataset.__getitem__(index) # get label label = self.labels[target.data] # create mask mask = torch.IntTensor(self.num_rows * self.num_columns, 1).zero_() mask[-1, 0] = 1 data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['images'] = img.view(28 * 28, 1, 1, 1) data_dict['mask'] = mask data_dict['targets'] = target.expand((28 * 28, 1)) data_dict['targets_label'] = label return data_dict
def __getitem__(self, index): """ Getter method to access the dataset and return a sample. :param index: index of the sample to return. :return: DataDict({'images','questions', 'questions_length', 'questions_string', 'questions_type', 'targets', \ 'targets_string', 'index','imgfiles'}), with: - images: extracted feature maps from the raw image - questions: tensor of word indexes - questions_length: len(question) - questions_string: original question string - questions_type: category of the question (query, count...) - targets: index of the answer in the answers dictionary - targets_string: None for now - index: index of the sample - imgfiles: image filename """ # load tokenized_question, answer, string_question, image_filename from self.data question, answer, question_string, imgfile, question_type = self.data[index].values() # create the image index to retrieve the feature maps or the original image index = str(imgfile.rsplit('_', 1)[1][:-4]).zfill(6) extension = '.png' if self.raw_image else '.pt' with open(os.path.join(self.image_source, '{}_{}_{}{}'.format('CLEVR-CoGenT' if self.dataset=='CLEVR-CoGenT' else 'CLEVR', self.set, index, extension)), 'rb') as f: try: img = torch.load(f) # for feature maps img = torch.from_numpy(img).type(torch.FloatTensor).squeeze() except: img = Image.open(f).convert('RGB') # for the original images img = ToTensor()(img).type(torch.FloatTensor).squeeze() # embed question if self.embedding_type == 'random': # embed question: question = self.embed_layer(torch.LongTensor(question)).type(torch.FloatTensor) else: # embed question question = self.language.embed_sentence(question_string) question_length = question.shape[0] # return everything data_dict = DataDict({key: None for key in self.data_definitions.keys()}) data_dict['images'] = img data_dict['questions'] = question data_dict['questions_length'] = question_length data_dict['questions_string'] = question_string data_dict['questions_type'] = question_type data_dict['targets'] = answer # leave data_dict['target_string'] as None data_dict['index'] = index data_dict['imgfiles'] = imgfile return data_dict
def create_data_dict(self):
    """
    Returns a DataDict object with keys created on the problem data_definitions and empty values (None).

    :return: new DataDict object.
    """
    return DataDict({key: None for key in self.data_definitions.keys()})
def create_data_dict(self):
    """
    Returns a :py:class:`miprometheus.utils.DataDict` object with keys created on the \
    problem data_definitions and empty values (None).

    :return: new :py:class:`miprometheus.utils.DataDict` object.
    """
    return DataDict({key: None for key in self.data_definitions.keys()})
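# A usage sketch of ``create_data_dict`` (the data_definitions below are hypothetical,
# for illustration; the import path is assumed from the
# :py:class:`miprometheus.utils.DataDict` references above).
import torch
from miprometheus.utils import DataDict

data_definitions = {'images': {'size': [-1, 3, 32, 32], 'type': [torch.Tensor]},
                    'targets': {'size': [-1], 'type': [torch.Tensor]}}

# equivalent to what create_data_dict() does with self.data_definitions:
data_dict = DataDict({key: None for key in data_definitions.keys()})
data_dict['targets'] = torch.tensor(5)  # values are then filled in, key by key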
def __getitem__(self, index): """ Getter that returns an individual sample from the problem's associated dataset (that can be generated \ `on-the-fly`, or retrieved from disk. It can also possibly be composed of several files.). .. note:: **To be redefined in subclasses.** .. note:: **The getter should return a DataDict: its keys should be defined by** ``self.data_definitions`` **keys.** This ensures consistency of the content of the :py:class:`miprometheus.utils.DataDict` when processing \ to the `handshake` between the :py:class:`miprometheus.problems.Problem` class and the \ :py:class:`miprometheus.models.Model` class. For more information, please see\ :py:func:`miprometheus.models.Model.handshake_definitions`. e.g.: >>> data_dict = DataDict({key: None for key in self.data_definitions.keys()}) >>> # you can now access each value by its key and assign the corresponding object (e.g. `torch.tensor` etc) >>> ... >>> return data_dict .. warning:: `Mi-Prometheus` supports multiprocessing for data loading (through the use of\ :py:class:`torch.utils.data.DataLoader`). To construct a batch (say 64 samples), the indexes are distributed among several workers (say 4, so that each worker has 16 samples to retrieve). It is best that samples can be accessed individually in the dataset folder so that there is no mutual exclusion between the workers and the performance is not degraded. If each sample is generated `on-the-fly`, this shouldn't cause a problem. There may be an issue with \ randomness. Please refer to the official PyTorch documentation for this. :param index: index of the sample to return. :type index: int :return: Empty ``DataDict``, having the same key as ``self.data_definitions``. """ return DataDict({key: None for key in self.data_definitions.keys()})
def collate_fn(self, batch): """ Combines a list of DataDict (retrieved with ``__getitem__``) into a batch. .. note:: Because each tokenized question has a variable length, padding is necessary to create batches. Hence, for a given batch, each question is padded to the length of the longest one. This length changes between batches, but this shouldn't be an issue. :param batch: list of individual samples to combine :type batch: list :return: DataDict({'images','questions', 'questions_length', 'questions_string', 'questions_type', 'targets', \ 'targets_string', 'index','imgfiles'}) """ batch_size = len(batch) # get max question length, create tensor of shape [batch_size x maxQuestionLength] & sort questions by # decreasing length max_len = max(map(lambda x: x['questions_length'], batch)) sort_by_len = sorted(batch, key=lambda x: x['questions_length'], reverse=True) # create tensor containing the embedded questions questions = torch.zeros(batch_size, max_len, self.embedding_dim).type(torch.FloatTensor) # construct the DataDict and fill it with the batch data_dict = DataDict({key: None for key in self.data_definitions.keys()}) data_dict['images'] = torch.stack([elt['images'] for elt in sort_by_len]).type(torch.FloatTensor) data_dict['questions_length'] = [elt['questions_length'] for elt in sort_by_len] data_dict['targets'] = torch.tensor([elt['targets'] for elt in sort_by_len]).type(torch.LongTensor) data_dict['questions_string'] = [elt['questions_string'] for elt in sort_by_len] data_dict['index'] = [elt['index'] for elt in sort_by_len] data_dict['imgfiles'] = [elt['imgfiles'] for elt in sort_by_len] data_dict['questions_type'] = [elt['questions_type'] for elt in sort_by_len] for i, length in enumerate(data_dict['questions_length']): # only way to do this? questions[i, :length, :] = sort_by_len[i]['questions'] data_dict['questions'] = questions return data_dict
def collate_fn(self, batch): """ Combines a list of ``DataDict`` (retrieved with ``__getitem__`` ) into a batch. .. note:: This function wraps a call to ``default_collate`` and simply returns the batch as a ``DataDict``\ instead of a dict. Multi-processing is supported as the data sources are small enough to be kept in memory\ (`self.root-dir/cifar-10-batches/data_batch_i` have a size of 31.0 MB). :param batch: list of individual ``DataDict`` samples to combine. :return: ``DataDict({'images','targets', 'targets_label'})`` containing the batch. """ return DataDict({key: value for key, value in zip(self.data_definitions.keys(), super(CIFAR10, self).collate_fn(batch).values())})
def collate_fn(self, batch): """ Combines a list of ``DataDict`` (retrieved with ``__getitem__`` ) into a batch. .. note:: This function wraps a call to ``default_collate`` and simply returns the batch as a ``DataDict``\ instead of a dict. :param batch: list of individual ``DataDict`` samples to combine. :return: ``DataDict({'images','questions', 'targets', 'targets_index', 'scenes_description'})`` containing the batch. """ return DataDict({ key: value for key, value in zip( self.data_definitions.keys(), super(SortOfCLEVR, self).collate_fn(batch).values()) })
def collate_fn(self, batch): """ Combines a list of ``DataDict`` (retrieved with ``__getitem__`` ) into a batch. .. note:: This function wraps a call to ``default_collate`` and simply returns the batch as a ``DataDict``\ instead of a dict. Multi-processing is supported as the data sources are small enough to be kept in memory\ (`training.pt` has a size of 47.5 MB). :param batch: list of individual ``DataDict`` samples to combine. :return: ``DataDict({'sequences','targets', 'targets_label'})`` containing the batch. """ return DataDict({ key: value for key, value in zip( self.data_definitions.keys(), super(SequentialPixelMNIST, self).collate_fn(batch).values()) })
def __getitem__(self, index): """ Getter method to access the dataset and return a sample. .. warning:: **HDF5 does not support multi threaded data access with num_workers > 1 on the data loading.** A way around this is to move every call for opening the HDF5 file to this ``__getitem__`` method. See https://discuss.pytorch.org/t/hdf5-multi-threaded-alternative/6189/9 for more info. :param index: index of the sample to return. :return: DataDict({'images','questions', 'targets', 'targets_index', 'scenes_description'}), with: - images: images (``self.img_size``) - questions: encoded questions - targets: one-hot encoded answers - targets_index: index of the answers - scenes_description: Scene description. """ # load the file data = h5py.File(self.filename, 'r') sample = data[str(index)] data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['images'] = (sample['image'].value / 255).transpose(2, 1, 0) data_dict['questions'] = sample['question'].value.astype(np.float32) data_dict['targets_classes'] = sample['answer'].value.astype( np.float32) data_dict['targets'] = np.argmax(data_dict['targets_classes']) data_dict['scenes_description'] = sample['scene_description'].value return data_dict
def __getitem__(self, index): """ Retrieves a sample from ``self.tensor_pairs`` and get the associated strings from ``self.pairs``. :param index: index of the sample to return. :type index: int :return: DataDict({'inputs', 'inputs_length', 'inputs_text' 'targets', 'targets_length', 'targets_text'}). """ # get tensors and strings input_tensor, target_tensor = self.tensor_pairs[index] input_text, target_text = self.pairs[index] # embed the input sentence: input_tensor = self.input_embed_layer( torch.LongTensor(input_tensor)).type(torch.FloatTensor) # embed the output sentence: target_tensor = self.output_embed_layer( torch.LongTensor(target_tensor)).type(torch.FloatTensor) # return data_dict data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['inputs'] = input_tensor data_dict['inputs_length'] = len(input_tensor) data_dict['inputs_text'] = input_text data_dict['targets'] = target_tensor data_dict['targets_length'] = len(target_tensor) data_dict['targets_text'] = target_text return data_dict
        'store_bit': 0,
        'recall_bit': 1
    }

    input_size = problem_default_values['input_item_size']
    output_size = problem_default_values['output_item_size']

    # Construct our model by instantiating the class defined above.
    model = NTM(params, problem_default_values)

    # Check for different seq_lengths and batch_sizes.
    for i in range(2):
        # Create random Tensors to hold inputs and outputs.
        x = torch.randn(batch_size, seq_length, input_size)
        y = torch.randn(batch_size, seq_length, output_size)
        dt = DataDict({'sequences': x, 'targets': y})

        # Test forward pass.
        logger.info("------- forward -------")
        y_pred = model(dt)

        logger.info("------- result -------")
        logger.info("input {}:\n {}".format(x.size(), x))
        logger.info("target.size():\n {}".format(y.size()))
        logger.info("prediction {}:\n {}".format(y_pred.size(), y_pred))

        # Plot it and check whether window was closed or not.
        if model.plot(dt, y_pred):
            break

        # Change batch size and seq_length.
def __getitem__(self, index): """ Getter that returns one individual sample generated on-the-fly .. note:: The sequence length is drawn randomly between ``self.min_sequence_length`` and \ ``self.max_sequence_length``. :param index: index of the sample to return. :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [SEQ_LENGTH, CONTROL_BITS+DATA_BITS]. SEQ_LENGTH depends on number of sub-sequences and its lengths. - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [SEQ_LENGTH, DATA_BITS], - mask: [SEQ_LENGTH] - num_subsequences: number of subsequences """ # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED # TODO: This is commented for now to avoid the issue with `add_ctrl` and `augment` in AlgorithmicSeqToSeqProblem # TODO: NOT SURE THAT THIS FN IS WORKING WELL (WITHOUT THE PRESENCE OF THE BATCH DIMENSION) ''' # define control channel markers pos = [0, 0] ctrl_data = [0, 0] ctrl_dummy = [0, 1] ctrl_inter = [0, 1] # assign markers markers = ctrl_data, ctrl_dummy, pos # number sub sequences num_sub_seq = np.random.randint( self.num_subseq_min, self.num_subseq_max + 1) # set the sequence length of each marker seq_length = np.random.randint(low=self.min_sequence_length, high=self.max_sequence_length + 1, size=num_sub_seq) # generate subsequences for x and y x = [np.random.binomial(1, self.bias, (n, self.data_bits)) for n in seq_length] x_last = [a[None, -1, :] for a in x] # create the target seq_length_tdummies = sum(seq_length) + seq_length.shape[0] + 1 dummies_target = np.zeros([seq_length_tdummies, self.data_bits], dtype=np.float32) targets = np.concatenate([dummies_target] + x_last, axis=0) # data of x and dummies xx = [self.augment(seq, markers, ctrl_start=[1, 0], add_marker_data=True, add_marker_dummy=False) for seq in x] # data of x data_1 = [arr for a in xx for arr in a[:-1]] # this is a marker between sub sequence x and dummies inter_seq = self.add_ctrl(np.zeros((1, self.data_bits)), ctrl_inter, pos) # dummies of x x_dummy_last = [a[None, -1, :] for b in xx for a in b[-1:]] # concatenate all parts of the inputs inputs = np.concatenate(data_1 + [inter_seq] + x_dummy_last, axis=0) # PyTorch variables inputs = torch.from_numpy(inputs).type(self.app_state.dtype) targets = torch.from_numpy(targets).type(self.app_state.dtype) # TODO: batch might have different sequence lengths mask_all = inputs[..., 0:self.control_bits] == 1 mask = mask_all[..., 0] for i in range(self.control_bits): mask = mask_all[..., i] * mask # TODO: fix the batch indexing # rest channel values of data dummies inputs[mask[0], 0:self.control_bits] = 0 # Return data_dict. data_dict = DataDict({key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = inputs data_dict['sequences_length'] = max(seq_length) data_dict['targets'] = targets data_dict['mask'] = mask data_dict['num_subsequences'] = num_sub_seq ''' return DataDict({key: None for key in self.data_definitions.keys()}) #data_dict
def collate_fn(self, batch): """ Generates a batch of samples on-the-fly .. warning:: Because of the fact that the sequence length is randomly drawn between ``self.min_sequence_length`` and \ ``self.max_sequence_length`` and then fixed for one given batch (**but varies between batches**), \ we cannot follow the scheme `merge together individuals samples that can be retrieved in parallel with\ several workers.` Indeed, each sample could have a different sequence length, and merging them together\ would then not be possible (we cannot have variable-sequence-length samples within one batch \ without padding). Hence, ``collate_fn`` generates on-the-fly a batch of samples, all having the same length (initially\ randomly selected). The samples created by ``__getitem__`` are simply not used in this function. :param batch: Should be a list of DataDict retrieved by `__getitem__`, each containing tensors, numbers,\ dicts or lists. --> **Not Used Here!** :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [BATCH_SIZE, SEQ_LENGTH, CONTROL_BITS+DATA_BITS], - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [BATCH_SIZE, SEQ_LENGTH, DATA_BITS], - mask: [BATCH_SIZE, SEQ_LENGTH] - num_subsequences: number of subsequences # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED """ # get the batch_size batch_size = len(batch) # define control channel markers pos = [0, 0] ctrl_data = [0, 0] ctrl_dummy = [0, 1] ctrl_inter = [0, 1] # assign markers markers = ctrl_data, ctrl_dummy, pos # number sub sequences num_sub_seq = np.random.randint(self.num_subseq_min, self.num_subseq_max + 1) # set the sequence length of each marker seq_length = np.random.randint(low=self.min_sequence_length, high=self.max_sequence_length + 1, size=num_sub_seq) # generate subsequences for x and y x = [np.random.binomial(1, self.bias, (batch_size, n, self.data_bits)) for n in seq_length] x_last = [a[:, None, -1, :] for a in x] # create the target seq_length_tdummies = sum(seq_length) + seq_length.shape[0] + 1 dummies_target = np.zeros([batch_size, seq_length_tdummies, self.data_bits], dtype=np.float32) targets = np.concatenate([dummies_target] + x_last, axis=1) # data of x and dummies xx = [self.augment(seq, markers, ctrl_start=[1, 0], add_marker_data=True, add_marker_dummy=False) for seq in x] # data of x data_1 = [arr for a in xx for arr in a[:-1]] # this is a marker between sub sequence x and dummies inter_seq = self.add_ctrl(np.zeros((batch_size, 1, self.data_bits)), ctrl_inter, pos) # dummies of x x_dummy_last = [a[:, None, -1, :] for b in xx for a in b[-1:]] # concatenate all parts of the inputs inputs = np.concatenate(data_1 + [inter_seq] + x_dummy_last, axis=1) # PyTorch variables inputs = torch.from_numpy(inputs).type(self.app_state.dtype) targets = torch.from_numpy(targets).type(self.app_state.dtype) # TODO: batch might have different sequence lengths mask_all = inputs[..., 0:self.control_bits] == 1 mask = mask_all[..., 0] for i in range(self.control_bits): mask = mask_all[..., i] * mask # TODO: fix the batch indexing # rest channel values of data dummies inputs[:, mask[0], 0:self.control_bits] = 0 # Return data_dict. data_dict = DataDict({key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = inputs data_dict['sequences_length'] = max(seq_length) data_dict['targets'] = targets data_dict['mask'] = mask data_dict['num_subsequences'] = num_sub_seq return data_dict
def collate_fn(self, batch): """ Generates a batch of samples on-the-fly .. warning:: Because of the fact that the sequence length is randomly drawn between ``self.min_sequence_length`` and \ ``self.max_sequence_length`` and then fixed for one given batch (**but varies between batches**), \ we cannot follow the scheme `merge together individuals samples that can be retrieved in parallel with\ several workers.` Indeed, each sample could have a different sequence length, and merging them together\ would then not be possible (we cannot have variable-sequence-length samples within one batch \ without padding). Hence, ``collate_fn`` generates on-the-fly a batch of samples, all having the same length (initially\ randomly selected). The samples created by ``__getitem__`` are simply not used in this function. :param batch: Should be a list of DataDict retrieved by `__getitem__`, each containing tensors, numbers,\ dicts or lists. --> **Not Used Here!** :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [BATCH_SIZE, 2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS], - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [BATCH_SIZE, 2*SEQ_LENGTH+2, DATA_BITS], - mask: [BATCH_SIZE, [2*SEQ_LENGTH+2] - num_subsequences: 1 """ # get the batch_size batch_size = len(batch) # Set sequence length. seq_length = np.random.randint(self.min_sequence_length, self.max_sequence_length + 1) # Generate batch of random bit sequences [BATCH_SIZE x SEQ_LENGTH X # DATA_BITS] bit_seq = np.random.binomial(1, self.bias, (batch_size, seq_length, self.data_bits)) # Generate input: [BATCH_SIZE, 2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS] inputs = np.zeros([ batch_size, 2 * seq_length + 2, self.control_bits + self.data_bits ], dtype=np.float32) # Set start control marker. inputs[:, 0, 0] = 1 # Memorization bit. # Set bit sequence. inputs[:, 1:seq_length + 1, self.control_bits:self.control_bits + self.data_bits] = bit_seq # Set end control marker. inputs[:, seq_length + 1, 1] = 1 # Recall bit. # Generate target: [BATCH_SIZE, 2*SEQ_LENGTH+2, DATA_BITS] (only data # bits!) targets = np.zeros([batch_size, 2 * seq_length + 2, self.data_bits], dtype=np.float32) # Set bit sequence. # Rotate sequence by shifting the items to right: seq >> num_items # i.e num_items = 2 -> seq_items >> 2 # and num_items = -1 -> seq_items << 1 # For that reason we must change the sign of num_items num_items = -self.num_items # Check if we are using relative or absolute rotation. if -1 < num_items < 1: num_items = num_items * seq_length # Round items shift to int. num_items = np.round(num_items) # Modulo items shift with length of the sequence. num_items = int(num_items % seq_length) # Apply items shift bit_seq = np.concatenate( (bit_seq[:, num_items:, :], bit_seq[:, :num_items, :]), axis=1) targets[:, seq_length + 2:, :] = bit_seq # Generate target mask: [BATCH_SIZE, 2*SEQ_LENGTH+2] mask = torch.zeros([batch_size, 2 * seq_length + 2 ]).type(self.app_state.ByteTensor) mask[:, seq_length + 2:] = 1 # PyTorch variables. ptinputs = torch.from_numpy(inputs).type(self.app_state.dtype) pttargets = torch.from_numpy(targets).type(self.app_state.dtype) # Return data_dict. data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = ptinputs data_dict['sequences_length'] = seq_length data_dict['targets'] = pttargets data_dict['mask'] = mask data_dict['num_subsequences'] = 1 return data_dict
def __getitem__(self, index): """ Getter that returns one individual sample generated on-the-fly .. note:: The sequence length is drawn randomly between ``self.min_sequence_length`` and \ ``self.max_sequence_length``. :param index: index of the sample to return. :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS], - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [2*SEQ_LENGTH+2, DATA_BITS], - mask: [2*SEQ_LENGTH+2] - num_subsequences: 1 """ # Set sequence length. seq_length = np.random.randint(self.min_sequence_length, self.max_sequence_length + 1) # Generate batch of random bit sequences [SEQ_LENGTH X DATA_BITS] bit_seq = np.random.binomial(1, self.bias, (seq_length, self.data_bits)) # Generate input: [2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS] inputs = np.zeros( [2 * seq_length + 2, self.control_bits + self.data_bits], dtype=np.float32) # Set start control marker. inputs[0, 0] = 1 # Memorization bit. # Set bit sequence. inputs[1:seq_length + 1, self.control_bits:self.control_bits + self.data_bits] = bit_seq # Set end control marker. inputs[seq_length + 1, 1] = 1 # Recall bit. # Generate target: [2*SEQ_LENGTH+2, DATA_BITS] (only data bits!) targets = np.zeros([2 * seq_length + 2, self.data_bits], dtype=np.float32) # Set bit sequence. # Rotate sequence by shifting the items to right: seq >> num_items # i.e num_items = 2 -> seq_items >> 2 # and num_items = -1 -> seq_items << 1 # For that reason we must change the sign of num_items num_items = -self.num_items # Check if we are using relative or absolute rotation. if -1 < num_items < 1: num_items = num_items * seq_length # Round items shift to int. num_items = np.round(num_items) # Modulo items shift with length of the sequence. num_items = int(num_items % seq_length) # Apply items shift bit_seq = np.concatenate( (bit_seq[num_items:, :], bit_seq[:num_items, :]), axis=0) targets[seq_length + 2:, :] = bit_seq # Generate target mask: [2*SEQ_LENGTH+2] mask = torch.zeros([2 * seq_length + 2 ]).type(self.app_state.ByteTensor) mask[seq_length + 2:] = 1 # PyTorch variables. ptinputs = torch.from_numpy(inputs).type(self.app_state.dtype) pttargets = torch.from_numpy(targets).type(self.app_state.dtype) # Return data_dict. data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = ptinputs data_dict['sequences_length'] = seq_length data_dict['targets'] = pttargets data_dict['mask'] = mask data_dict['num_subsequences'] = 1 return data_dict
        file_folder_to_check = os.path.expanduser(file_folder_to_check)
        if not (os.path.isfile(file_folder_to_check) or os.path.isdir(file_folder_to_check)):
            self.logger.info('Downloading {}'.format(url))
            urllib.request.urlretrieve(url, os.path.expanduser(download_name), reporthook)
            return True
        else:
            self.logger.info('Dataset found at {}'.format(file_folder_to_check))
            return False


if __name__ == '__main__':
    """Unit test for Problem and DataDict"""
    from miprometheus.utils.param_interface import ParamInterface

    params = ParamInterface()

    problem = Problem(params)
    problem.data_definitions = {'inputs': {'size': [-1, -1], 'type': [torch.Tensor]},
                                'targets': {'size': [-1], 'type': [torch.Tensor]}
                                }
    problem.loss_function = torch.nn.CrossEntropyLoss()  # torch.nn.L1Loss, torch.nn.TripletMarginLoss

    datadict = DataDict({key: None for key in problem.data_definitions.keys()})

    # datadict['inputs'] = torch.ones([64, 20, 512]).type(torch.FloatTensor)
    # datadict['targets'] = torch.ones([64, 20]).type(torch.FloatTensor)

    # print(repr(datadict))
def collate_fn(self, batch): """ Combines a list of DataDict (retrieved with ``__getitem__``) into a batch. .. note:: Because each tokenized sentence has a variable length, padding is necessary to create batches. Hence, for a given batch, each sentence is padded to the length of the longest one. **The batch is sorted decreasingly as a function of the input sentences length.** This length changes between batches, but this shouldn't be an issue. :param batch: Individual samples to combine :type batch: list :return: ``DataDict({'inputs', 'inputs_length', 'inputs_text' 'targets', 'targets_length', 'targets_text'})``\ containing the batch. """ batch_size = len(batch) # get max input sentence length, create tensor of shape [batch_size x max_input_length] & sort inputs by # decreasing length max_input_len = max(map(lambda x: x['inputs_length'], batch)) sort_by_len = sorted(batch, key=lambda x: x['inputs_length'], reverse=True) # create tensor containing the embedded input sentences inputs = torch.zeros(batch_size, max_input_len, self.embedding_dim).type(torch.FloatTensor) # get max output sentence length max_output_len = max(map(lambda x: x['targets_length'], batch)) # create tensor containing the embedded output sentences outputs = torch.zeros(batch_size, max_output_len, self.embedding_dim).type(torch.FloatTensor) # construct the DataDict and fill it with the batch data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['inputs_length'] = [ elt['inputs_length'] for elt in sort_by_len ] data_dict['inputs_text'] = [elt['inputs_text'] for elt in sort_by_len] data_dict['targets_length'] = [ elt['targets_length'] for elt in sort_by_len ] data_dict['targets_text'] = [ elt['targets_text'] for elt in sort_by_len ] for i, length in enumerate( data_dict['inputs_length']): # only way to do this? inputs[i, :length, :] = sort_by_len[i]['inputs'] outputs[i, :data_dict['targets_length'][i], :] = sort_by_len[i][ 'targets'] data_dict['inputs'] = inputs data_dict['targets'] = outputs return data_dict
def __getitem__(self, index): """ Getter that returns one individual sample generated on-the-fly .. note:: The sequence length is drawn randomly between ``self.min_sequence_length`` and \ ``self.max_sequence_length``. :param index: index of the sample to return. :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS], - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [2*SEQ_LENGTH+2, DATA_BITS], - mask: [2*SEQ_LENGTH+2] - num_subsequences: 1 """ # Set sequence length seq_length = np.random.randint(self.min_sequence_length, self.max_sequence_length + 1) # Generate batch of random bit sequences [SEQ_LENGTH X # DATA_BITS] bit_seq = np.random.binomial(1, self.bias, (seq_length, self.data_bits)) # Generate input: [2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS] inputs = np.zeros( [2 * seq_length + 2, self.control_bits + self.data_bits], dtype=np.float32) # Set start control marker. inputs[0, 0] = 1 # Memorization bit. # Set bit sequence. inputs[1:seq_length + 1, self.control_bits:self.control_bits + self.data_bits] = bit_seq # Set end control marker. inputs[seq_length + 1, 1] = 1 # Recall bit. # Generate target: [2*SEQ_LENGTH+2, DATA_BITS] (only data # bits!) targets = np.zeros([2 * seq_length + 2, self.data_bits], dtype=np.float32) # Set target bit sequence - logical not. targets[seq_length + 2:, :] = np.logical_not(bit_seq) # Generate target mask: [2*SEQ_LENGTH+2] mask = torch.zeros([2 * seq_length + 2 ]).type(self.app_state.ByteTensor) mask[seq_length + 2:] = 1 # PyTorch variables. ptinputs = torch.from_numpy(inputs).type(self.app_state.dtype) pttargets = torch.from_numpy(targets).type(self.app_state.dtype) # Return data_dict. data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = ptinputs data_dict['sequences_length'] = seq_length data_dict['targets'] = pttargets data_dict['mask'] = mask data_dict['num_subsequences'] = 1 return data_dict
def collate_fn(self, batch): """ Generates a batch of samples on-the-fly .. warning:: Because of the fact that the sequence length is randomly drawn between ``self.min_sequence_length`` and \ ``self.max_sequence_length`` and then fixed for one given batch (**but varies between batches**), \ we cannot follow the scheme `merge together individuals samples that can be retrieved in parallel with\ several workers.` Indeed, each sample could have a different sequence length, and merging them together\ would then not be possible (we cannot have variable-sequence-length samples within one batch \ without padding). Hence, ``collate_fn`` generates on-the-fly a batch of samples, all having the same length (initially\ randomly selected). The samples created by ``__getitem__`` are simply not used in this function. :param batch: Should be a list of DataDict retrieved by `__getitem__`, each containing tensors, numbers,\ dicts or lists. --> **Not Used Here!** :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [BATCH_SIZE, 2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS], - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [BATCH_SIZE, 2*SEQ_LENGTH+2, DATA_BITS], - mask: [BATCH_SIZE, [2*SEQ_LENGTH+2] - num_subsequences: 1 pattern of inputs: # x1 % y1 # x2 % y2 ... # xn % yn & d $ d` pattern of target: dummies ... ... ... ... yn all(xi) mask: used to mask the data part of the target. xi, yi, and d(d'): sub sequences x of random length, sub sequence y of random length and dummies. # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED & IMPROVED """ # get the batch_size batch_size = len(batch) # define control channel markers pos = [0, 0, 0, 0] ctrl_data = [0, 0, 0, 0] ctrl_dummy = [0, 0, 1, 0] ctrl_inter = [0, 0, 0, 1] # assign markers markers = ctrl_data, ctrl_dummy, pos # number of sub_sequences nb_sub_seq_a = np.random.randint(self.num_subseq_min, self.num_subseq_max + 1) # might be different in future implementation nb_sub_seq_b = nb_sub_seq_a # set the sequence length of each marker seq_lengths_a = np.random.randint(low=self.min_sequence_length, high=self.max_sequence_length + 1, size=nb_sub_seq_a) seq_lengths_b = np.random.randint(low=self.min_sequence_length, high=self.max_sequence_length + 1, size=nb_sub_seq_b) # generate subsequences for x and y x = [ np.random.binomial(1, self.bias, (batch_size, n, self.data_bits)) for n in seq_lengths_a ] y = [ np.random.binomial(1, self.bias, (batch_size, n, self.data_bits)) for n in seq_lengths_b ] # create the target target_wo_dummies = np.concatenate([y[-1]] + x, axis=1) # add marker at the begging of x and dummies xx = [ self.augment(seq, markers, ctrl_start=[1, 0, 0, 0], add_marker_data=True, add_marker_dummy=False) for seq in x ] # add marker at the begging of y and dummies of same length, also a # marker at the begging of dummies is added yy = [ self.augment(seq, markers, ctrl_start=[0, 1, 0, 0], add_marker_data=True) for seq in y ] # this is a marker to separate dummies of x and y at the end of the # sequence inter_seq = self.add_ctrl(np.zeros((batch_size, 1, self.data_bits)), ctrl_inter, pos) # data which contains all xs and all ys data_1 = [arr for a, b in zip(xx, yy) for arr in a[:-1] + b[:-1]] # dummies of y and xs data_2 = [yy[-1][-1]] + [inter_seq] + [a[-1] for a in xx] # concatenate all parts of the inputs inputs = np.concatenate(data_1 + data_2, axis=1) # PyTorch variables inputs = torch.from_numpy(inputs).type(self.app_state.dtype) 
target_wo_dummies = torch.from_numpy(target_wo_dummies).type( self.app_state.dtype) # create the mask mask_all = inputs[:, :, 0:self.control_bits] == 1 mask = mask_all[..., 0] for i in range(self.control_bits): mask = mask_all[..., i] * mask # rest ctrl channel of dummies inputs[:, mask[0], 0:self.control_bits] = 0 # Create the target with the dummies targets = torch.zeros_like(inputs[:, :, self.control_bits:]) targets[:, mask[0], :] = target_wo_dummies # Return data_dict. data_dict = DataDict( {key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = inputs data_dict['sequences_length'] = max(seq_lengths_a) data_dict['targets'] = targets data_dict['mask'] = mask data_dict['num_subsequences'] = nb_sub_seq_a + nb_sub_seq_b return data_dict
def __getitem__(self, index): """ Getter that returns one individual sample generated on-the-fly .. note:: The sequence length is drawn randomly between ``self.min_sequence_length`` and \ ``self.max_sequence_length``. :param index: index of the sample to return. :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with: - sequences: [SEQ_LENGTH, CONTROL_BITS+DATA_BITS], - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length** - targets: [SEQ_LENGTH, DATA_BITS], - mask: [SEQ_LENGTH] - num_subsequences: 1 pattern of inputs: # x1 % y1 # x2 % y2 ... # xn % yn & d $ d` pattern of target: dummies ... ... ... ... yn all(xi) mask: used to mask the data part of the target. xi, yi, and d(d'): sub sequences x of random length, sub sequence y of random length and dummies. # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED # TODO: This is commented for now to avoid the issue with `add_ctrl` and `augment` in AlgorithmicSeqToSeqProblem # TODO: NOT SURE THAT THIS FN IS WORKING WELL (WITHOUT THE PRESENCE OF THE BATCH DIMENSION) """ ''' # define control channel markers pos = [0, 0, 0, 0] ctrl_data = [0, 0, 0, 0] ctrl_dummy = [0, 0, 1, 0] ctrl_inter = [0, 0, 0, 1] # assign markers markers = ctrl_data, ctrl_dummy, pos # number of sub_sequences nb_sub_seq_a = np.random.randint( self.num_subseq_min, self.num_subseq_max + 1) # might be different in future implementation nb_sub_seq_b = nb_sub_seq_a # set the sequence length of each marker seq_lengths_a = np.random.randint( low=self.min_sequence_length, high=self.max_sequence_length + 1, size=nb_sub_seq_a) seq_lengths_b = np.random.randint( low=self.min_sequence_length, high=self.max_sequence_length + 1, size=nb_sub_seq_b) # generate subsequences for x and y x = [ np.random.binomial( 1, self.bias, (n, self.data_bits)) for n in seq_lengths_a] y = [ np.random.binomial( 1, self.bias, (n, self.data_bits)) for n in seq_lengths_b] # create the target target_wo_dummies = np.concatenate([y[-1]] + x, axis=0) # add marker at the begging of x and dummies xx = [ self.augment( seq, markers, ctrl_start=[ 1, 0, 0, 0], add_marker_data=True, add_marker_dummy=False) for seq in x] # add marker at the begging of y and dummies of same length, also a # marker at the begging of dummies is added yy = [self.augment(seq, markers, ctrl_start=[ 0, 1, 0, 0], add_marker_data=True) for seq in y] # this is a marker to separate dummies of x and y at the end of the # sequence inter_seq = self.add_ctrl( np.zeros((1, self.data_bits)), ctrl_inter, pos) # data which contains all xs and all ys data_1 = [arr for a, b in zip(xx, yy) for arr in a[:-1] + b[:-1]] # dummies of y and xs data_2 = [yy[-1][-1]] + [inter_seq] + [a[-1] for a in xx] # concatenate all parts of the inputs inputs = np.concatenate(data_1 + data_2, axis=0) # PyTorch variables inputs = torch.from_numpy(inputs).type(self.app_state.dtype) target_wo_dummies = torch.from_numpy( target_wo_dummies).type(self.app_state.dtype) # create the mask mask_all = inputs[:, 0:self.control_bits] == 1 mask = mask_all[..., 0] for i in range(self.control_bits): mask = mask_all[..., i] * mask # rest ctrl channel of dummies inputs[mask[0], 0:self.control_bits] = 0 # Create the target with the dummies targets = torch.zeros_like(inputs[:, self.control_bits:]) targets[mask[0], :] = target_wo_dummies # Return data_dict. 
data_dict = DataDict({key: None for key in self.data_definitions.keys()}) data_dict['sequences'] = inputs data_dict['sequences_length'] = max(seq_lengths_a) data_dict['targets'] = targets data_dict['mask'] = mask data_dict['num_subsequences'] = nb_sub_seq_a + nb_sub_seq_b ''' return DataDict({key: None for key in self.data_definitions.keys()}) # data_dict