コード例 #1
0
    def __getitem__(self, index):
        """
        Fetch one sample from the wrapped dataset and package it as a ``DataDict``.

        :param index: index of the sample to return.
        :type index: int

        :return: ``DataDict`` keyed by ``self.data_definitions``, in which this method fills:

            - images: image returned by ``self.dataset`` for ``index``,
            - targets: index of the target class, wrapped in a ``torch.tensor``,
            - targets_label: entry of ``self.labels`` selected by the target.

        """
        image, class_index = self.dataset.__getitem__(index)
        class_index = torch.tensor(class_index)

        # The label is looked up with the tensor's underlying data.
        class_label = self.labels[class_index.data]

        # Every key declared in the data definitions starts out as None.
        sample = DataDict(dict.fromkeys(self.data_definitions.keys()))
        sample['images'] = image
        sample['targets'] = class_index
        sample['targets_label'] = class_label

        return sample
コード例 #2
0
    def __getitem__(self, index):
        """
        Fetch one sample and present the image as a sequence of rows.

        :param index: index of the sample to return.
        :type index: int

        :return: ``DataDict`` keyed by ``self.data_definitions``, in which this method fills:

            - images: the image viewed with shape ``[28, 1, 1, 28]``,
            - mask: ``IntTensor`` of shape ``[self.num_rows, 1]``, zero except for its last entry,
            - targets: index of the target class, expanded to shape ``[28, 1]``,
            - targets_label: label of the target class (cf ``self.labels``).

        """
        image, class_index = self.dataset.__getitem__(index)
        class_label = self.labels[class_index.data]

        # Only the final step of the sequence is flagged in the mask.
        last_step_mask = torch.IntTensor(self.num_rows, 1).zero_()
        last_step_mask[-1, 0] = 1

        sample = DataDict(dict.fromkeys(self.data_definitions.keys()))
        sample['images'] = image.view(28, 1, 1, 28)
        sample['mask'] = last_step_mask
        sample['targets'] = class_index.expand((28, 1))
        sample['targets_label'] = class_label

        return sample
コード例 #3
0
    def __getitem__(self, index):
        """
        Fetch one sample, pad its image and package everything as a ``DataDict``.

        :param index: index of the sample to return.
        :type index: int

        :return: ``DataDict`` keyed by ``self.data_definitions``, in which this method fills:

            - images: image padded with zeros according to ``self.padding``,
            - targets: index of the target class, wrapped in a ``torch.tensor``,
            - targets_label: label of the target class (cf ``self.labels``).

        """
        raw_image, class_index = self.dataset.__getitem__(index)
        class_index = torch.tensor(class_index)

        # Constant (zero) padding; the amounts come from ``self.padding``.
        padded_image = F.pad(input=raw_image, pad=self.padding, mode='constant', value=0)

        class_label = self.labels[class_index.data]

        sample = DataDict(dict.fromkeys(self.data_definitions.keys()))
        sample['images'] = padded_image
        sample['targets'] = class_index
        sample['targets_label'] = class_label

        return sample
コード例 #4
0
    def __getitem__(self, index):
        """
        Fetch one sample and package it, together with a last-step mask, as a ``DataDict``.

        :param index: index of the sample to return.
        :type index: int

        :return: ``DataDict`` keyed by ``self.data_definitions``, in which this method fills:

            - sequences: the item returned by ``self.dataset`` for ``index``,
            - mask: vector of length ``self.num_rows * self.num_columns``, zero except for its last entry,
            - targets: index of the target class,
            - targets_label: label of the target class (cf ``self.labels``).

        """
        pixels, class_index = self.dataset.__getitem__(index)
        class_label = self.labels[class_index.data]

        # One mask entry per pixel; only the last one is set.
        sequence_length = self.num_rows * self.num_columns
        last_step_mask = torch.zeros(sequence_length).type(self.app_state.IntTensor)
        last_step_mask[-1] = 1

        sample = DataDict(dict.fromkeys(self.data_definitions.keys()))
        sample['sequences'] = pixels
        sample['mask'] = last_step_mask
        sample['targets'] = class_index
        sample['targets_label'] = class_label

        return sample
コード例 #5
0
    def __getitem__(self, index):
        """
        Fetch one sample and present the image as a sequence of single pixels.

        :param index: index of the sample to return.
        :type index: int

        :return: ``DataDict`` keyed by ``self.data_definitions``, in which this method fills:

            - images: the image viewed with shape ``[28 * 28, 1, 1, 1]`` (one pixel per step),
            - mask: ``IntTensor`` of shape ``[self.num_rows * self.num_columns, 1]``, zero except for its last entry,
            - targets: index of the target class, expanded to shape ``[28 * 28, 1]``,
            - targets_label: label of the target class (cf ``self.labels``).

        """
        image, class_index = self.dataset.__getitem__(index)
        class_label = self.labels[class_index.data]

        # Only the final step of the sequence is flagged in the mask.
        last_step_mask = torch.IntTensor(self.num_rows * self.num_columns, 1).zero_()
        last_step_mask[-1, 0] = 1

        sample = DataDict(dict.fromkeys(self.data_definitions.keys()))
        sample['images'] = image.view(28 * 28, 1, 1, 1)
        sample['mask'] = last_step_mask
        sample['targets'] = class_index.expand((28 * 28, 1))
        sample['targets_label'] = class_label

        return sample
コード例 #6
0
    def __getitem__(self, index):
        """
        Getter method to access the dataset and return a sample.

        The image is read from ``self.image_source``. When ``self.raw_image`` is set, the original
        ``.png`` picture is opened and converted to a tensor; otherwise the pre-extracted feature
        maps stored in a ``.pt`` file are loaded.

        :param index: index of the sample to return.

        :return: DataDict({'images','questions', 'questions_length', 'questions_string', 'questions_type',
            'targets', 'targets_string', 'index','imgfiles'}), with:

            - images: feature maps, or the raw image as a tensor when ``self.raw_image`` is set
            - questions: embedded question
            - questions_length: first dimension of the embedded question
            - questions_string: original question string
            - questions_type: category of the question (query, count...)
            - targets: answer, as stored in ``self.data``
            - targets_string: None for now
            - index: zero-padded (6 characters) image index parsed from the image filename
            - imgfiles: image filename

        """
        # NOTE: this unpacking relies on the order of the values in each entry of ``self.data``.
        question, answer, question_string, imgfile, question_type = self.data[index].values()

        # Image index: text after the last '_' of the filename, minus its 4-character
        # extension, zero-padded to 6 characters. Kept in its own variable so that the
        # ``index`` argument is not overwritten.
        image_index = str(imgfile.rsplit('_', 1)[1][:-4]).zfill(6)
        extension = '.png' if self.raw_image else '.pt'
        prefix = 'CLEVR-CoGenT' if self.dataset == 'CLEVR-CoGenT' else 'CLEVR'
        image_path = os.path.join(self.image_source,
                                  '{}_{}_{}{}'.format(prefix, self.set, image_index, extension))

        # ``self.raw_image`` already decides which kind of file is opened, so pick the loader
        # explicitly instead of relying on a bare ``except`` (which would also swallow
        # KeyboardInterrupt and hide genuine loading errors).
        with open(image_path, 'rb') as f:
            if self.raw_image:
                img = Image.open(f).convert('RGB')  # original image
                img = ToTensor()(img).type(torch.FloatTensor).squeeze()
            else:
                img = torch.load(f)  # feature maps
                img = torch.from_numpy(img).type(torch.FloatTensor).squeeze()

        # embed question
        if self.embedding_type == 'random':
            question = self.embed_layer(torch.LongTensor(question)).type(torch.FloatTensor)
        else:
            question = self.language.embed_sentence(question_string)

        question_length = question.shape[0]

        # return everything
        data_dict = DataDict({key: None for key in self.data_definitions.keys()})

        data_dict['images'] = img
        data_dict['questions'] = question
        data_dict['questions_length'] = question_length
        data_dict['questions_string'] = question_string
        data_dict['questions_type'] = question_type
        data_dict['targets'] = answer
        # 'targets_string' is deliberately left as None
        data_dict['index'] = image_index
        data_dict['imgfiles'] = imgfile

        return data_dict
コード例 #7
0
    def create_data_dict(self):
        """
        Build a fresh ``DataDict`` holding one entry per key of ``self.data_definitions``.

        :return: new ``DataDict`` whose values are all ``None``.

        """
        empty_entries = dict.fromkeys(self.data_definitions.keys())
        return DataDict(empty_entries)
コード例 #8
0
    def create_data_dict(self):
        """
        Create a :py:class:`miprometheus.utils.DataDict` with the keys of the problem's
        ``data_definitions``, each mapped to ``None``.

        :return: new :py:class:`miprometheus.utils.DataDict` object.

        """
        placeholders = {}
        for key in self.data_definitions.keys():
            placeholders[key] = None
        return DataDict(placeholders)
コード例 #9
0
    def __getitem__(self, index):
        """
        Return one individual sample of the problem's associated dataset.

        The sample may be generated `on-the-fly` or retrieved from disk (possibly from several files).

        .. note::

            **To be redefined in subclasses.**


        .. note::

            **The getter should return a DataDict whose keys are those of** ``self.data_definitions``.

            Keeping the content of the :py:class:`miprometheus.utils.DataDict` consistent is what the
            `handshake` between the :py:class:`miprometheus.problems.Problem` class and the
            :py:class:`miprometheus.models.Model` class relies on. For more information, please see
            :py:func:`miprometheus.models.Model.handshake_definitions`.

            e.g.:

                >>> data_dict = DataDict({key: None for key in self.data_definitions.keys()})
                >>> # each value can now be assigned through its key (e.g. a `torch.tensor`)
                >>> ...
                >>> return data_dict


        .. warning::

            `Mi-Prometheus` supports multiprocessing for data loading (through the use of
            :py:class:`torch.utils.data.DataLoader`).

            To construct a batch (say 64 samples), the indexes are distributed among several workers
            (say 4, so that each worker has 16 samples to retrieve). It is best that samples can be
            accessed individually in the dataset folder, so that there is no mutual exclusion between
            the workers and the performance is not degraded.

            If each sample is generated `on-the-fly`, this shouldn't cause a problem. There may be an
            issue with randomness. Please refer to the official PyTorch documentation for this.


        :param index: index of the sample to return.
        :type index: int

        :return: Empty ``DataDict``, having the same keys as ``self.data_definitions``.

        """
        placeholders = dict.fromkeys(self.data_definitions.keys())
        return DataDict(placeholders)
コード例 #10
0
    def collate_fn(self, batch):
        """
        Merge a list of DataDict samples (as produced by ``__getitem__``) into one batch.

        .. note::

            Embedded questions have variable lengths, so each one is zero-padded up to the length of
            the longest question in the batch. That length may differ from one batch to the next.

            The samples are reordered by decreasing question length; every field of the returned
            batch follows that order.


        :param batch: list of individual samples to combine
        :type batch: list

        :return: DataDict({'images','questions', 'questions_length', 'questions_string', 'questions_type',
            'targets', 'targets_string', 'index','imgfiles'})

        """
        batch_size = len(batch)

        def question_length(sample):
            return sample['questions_length']

        # Longest question of the batch, and the samples ordered from longest to shortest.
        longest = max(map(question_length, batch))
        ordered = sorted(batch, key=question_length, reverse=True)

        # Zero-initialised buffer receiving the embedded questions.
        padded_questions = torch.zeros(batch_size, longest, self.embedding_dim).type(torch.FloatTensor)

        collated = DataDict(dict.fromkeys(self.data_definitions.keys()))

        collated['images'] = torch.stack([sample['images'] for sample in ordered]).type(torch.FloatTensor)
        collated['questions_length'] = [sample['questions_length'] for sample in ordered]
        collated['targets'] = torch.tensor([sample['targets'] for sample in ordered]).type(torch.LongTensor)
        collated['questions_string'] = [sample['questions_string'] for sample in ordered]
        collated['index'] = [sample['index'] for sample in ordered]
        collated['imgfiles'] = [sample['imgfiles'] for sample in ordered]
        collated['questions_type'] = [sample['questions_type'] for sample in ordered]

        # Copy each question into the leading part of its row; the remainder stays zero.
        for row, sample in enumerate(ordered):
            padded_questions[row, :sample['questions_length'], :] = sample['questions']

        collated['questions'] = padded_questions

        return collated
コード例 #11
0
    def collate_fn(self, batch):
        """
        Combine a list of ``DataDict`` samples (retrieved with ``__getitem__``) into a batch.

        .. note::

            The actual collation is delegated to the parent class' ``collate_fn``; this method only
            re-wraps the result as a ``DataDict``. The collated values are paired positionally with
            the keys of ``self.data_definitions``.

            Multi-processing is supported as the data sources are small enough to be kept in memory
            (`self.root-dir/cifar-10-batches/data_batch_i` have a size of 31.0 MB).

        :param batch: list of individual ``DataDict`` samples to combine.

        :return: ``DataDict({'images','targets', 'targets_label'})`` containing the batch.

        """
        definition_keys = self.data_definitions.keys()
        collated_values = super(CIFAR10, self).collate_fn(batch).values()

        return DataDict(dict(zip(definition_keys, collated_values)))
コード例 #12
0
    def collate_fn(self, batch):
        """
        Combine a list of ``DataDict`` samples (retrieved with ``__getitem__``) into a batch.

        .. note::

            The actual collation is delegated to the parent class' ``collate_fn``; this method only
            re-wraps the result as a ``DataDict``. The collated values are paired positionally with
            the keys of ``self.data_definitions``.

        :param batch: list of individual ``DataDict`` samples to combine.

        :return: ``DataDict({'images','questions', 'targets', 'targets_index', 'scenes_description'})``
            containing the batch.

        """
        definition_keys = self.data_definitions.keys()
        collated_values = super(SortOfCLEVR, self).collate_fn(batch).values()

        return DataDict(dict(zip(definition_keys, collated_values)))
コード例 #13
0
    def collate_fn(self, batch):
        """
        Combine a list of ``DataDict`` samples (retrieved with ``__getitem__``) into a batch.

        .. note::

            The actual collation is delegated to the parent class' ``collate_fn``; this method only
            re-wraps the result as a ``DataDict``. The collated values are paired positionally with
            the keys of ``self.data_definitions``.

            Multi-processing is supported as the data sources are small enough to be kept in memory
            (`training.pt` has a size of 47.5 MB).

        :param batch: list of individual ``DataDict`` samples to combine.

        :return: ``DataDict({'sequences','targets', 'targets_label'})`` containing the batch.

        """
        definition_keys = self.data_definitions.keys()
        collated_values = super(SequentialPixelMNIST, self).collate_fn(batch).values()

        return DataDict(dict(zip(definition_keys, collated_values)))
コード例 #14
0
    def __getitem__(self, index):
        """
        Getter method to access the dataset and return a sample.

        .. warning::

            **HDF5 does not support multi threaded data access with num_workers > 1 on the data loading.**
            A way around this is to move every call for opening the HDF5 file to this ``__getitem__`` method.

            See https://discuss.pytorch.org/t/hdf5-multi-threaded-alternative/6189/9 for more info.

        The file is opened for the duration of the call only and is closed again before returning;
        every value placed in the result has been read into memory beforehand.

        :param index: index of the sample to return.

        :return: ``DataDict`` keyed by ``self.data_definitions``, in which this method fills:

            - images: image divided by 255, with its axes reversed by ``transpose(2, 1, 0)``
            - questions: encoded question, as ``float32``
            - targets_classes: encoded answer, as ``float32``
            - targets: position of the largest entry of ``targets_classes``
            - scenes_description: scene description.

        """
        # Open the file inside a context manager so the handle is released even if a read fails
        # (one handle per sample would otherwise be leaked).
        with h5py.File(self.filename, 'r') as data:
            sample = data[str(index)]

            # ``dataset[()]`` reads the whole dataset into memory; it replaces the deprecated
            # ``Dataset.value`` attribute, which newer h5py releases no longer provide.
            images = (sample['image'][()] / 255).transpose(2, 1, 0)
            questions = sample['question'][()].astype(np.float32)
            targets_classes = sample['answer'][()].astype(np.float32)
            scenes_description = sample['scene_description'][()]

        data_dict = DataDict(
            {key: None
             for key in self.data_definitions.keys()})
        data_dict['images'] = images
        data_dict['questions'] = questions
        data_dict['targets_classes'] = targets_classes
        data_dict['targets'] = np.argmax(targets_classes)
        data_dict['scenes_description'] = scenes_description

        return data_dict
コード例 #15
0
    def __getitem__(self, index):
        """
        Retrieve one pair from ``self.tensor_pairs``, embed both sides, and attach the matching
        strings from ``self.pairs``.

        :param index: index of the sample to return.
        :type index: int

        :return: DataDict({'inputs', 'inputs_length', 'inputs_text', 'targets', 'targets_length',
            'targets_text'}), where the lengths are ``len()`` of the embedded tensors.

        """
        source_indices, target_indices = self.tensor_pairs[index]
        source_text, target_text = self.pairs[index]

        # Embed the input sentence.
        source_as_long = torch.LongTensor(source_indices)
        embedded_source = self.input_embed_layer(source_as_long).type(torch.FloatTensor)

        # Embed the output sentence.
        target_as_long = torch.LongTensor(target_indices)
        embedded_target = self.output_embed_layer(target_as_long).type(torch.FloatTensor)

        sample = DataDict(dict.fromkeys(self.data_definitions.keys()))

        sample['inputs'] = embedded_source
        sample['inputs_length'] = len(embedded_source)
        sample['inputs_text'] = source_text

        sample['targets'] = embedded_target
        sample['targets_length'] = len(embedded_target)
        sample['targets_text'] = target_text

        return sample
コード例 #16
0
        'store_bit': 0,
        'recall_bit': 1
    }
    input_size = problem_default_values['input_item_size']
    output_size = problem_default_values['output_item_size']

    # Construct our model by instantiating the class defined above
    model = NTM(params, problem_default_values)

    # Check for different seq_lengts and batch_sizes.
    for i in range(2):
        # Create random Tensors to hold inputs and outputs
        x = torch.randn(batch_size, seq_length, input_size)
        y = torch.randn(batch_size, seq_length, output_size)

        dt = DataDict({'sequences': x, 'targets': y})

        # Test forward pass.
        logger.info("------- forward -------")
        y_pred = model(dt)

        logger.info("------- result -------")
        logger.info("input {}:\n {}".format(x.size(), x))
        logger.info("target.size():\n {}".format(y.size()))
        logger.info("prediction {}:\n {}".format(y_pred.size(), y_pred))

        # Plot it and check whether window was closed or not.
        if model.plot(dt, y_pred):
            break

        # Change batch size and seq_length.
コード例 #17
0
    def __getitem__(self, index):
        """
        Getter for one individual sample.

        .. warning::

            On-the-fly generation of a single sample is currently **disabled**: this method returns
            a ``DataDict`` whose values are all ``None``. The disabled implementation is kept below
            as comments.

        :param index: index of the sample to return (currently unused).

        :return: DataDict with the keys of ``self.data_definitions`` and ``None`` values. The disabled
            implementation was meant to fill:

            - sequences: [SEQ_LENGTH, CONTROL_BITS+DATA_BITS]. SEQ_LENGTH depends on number of sub-sequences
                and its lengths.

            - sequences_length: longest of the randomly drawn sub-sequence lengths
            - targets: [SEQ_LENGTH, DATA_BITS],
            - mask: [SEQ_LENGTH]
            - num_subsequences: number of subsequences


        """
        # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED
        # TODO: This is commented for now to avoid the issue with `add_ctrl` and `augment` in AlgorithmicSeqToSeqProblem
        # TODO: NOT SURE THAT THIS FN IS WORKING WELL (WITHOUT THE PRESENCE OF THE BATCH DIMENSION)
        #
        # --- disabled implementation (start) ---
        #
        # # define control channel markers
        # pos = [0, 0]
        # ctrl_data = [0, 0]
        # ctrl_dummy = [0, 1]
        # ctrl_inter = [0, 1]
        #
        # # assign markers
        # markers = ctrl_data, ctrl_dummy, pos
        #
        # # number sub sequences
        # num_sub_seq = np.random.randint( self.num_subseq_min, self.num_subseq_max + 1)
        #
        # # set the sequence length of each marker
        # seq_length = np.random.randint(low=self.min_sequence_length, high=self.max_sequence_length + 1,
        #                                size=num_sub_seq)
        #
        # #  generate subsequences for x and y
        # x = [np.random.binomial(1, self.bias, (n, self.data_bits)) for n in seq_length]
        # x_last = [a[None, -1, :] for a in x]
        #
        # # create the target
        # seq_length_tdummies = sum(seq_length) + seq_length.shape[0] + 1
        #
        # dummies_target = np.zeros([seq_length_tdummies, self.data_bits], dtype=np.float32)
        # targets = np.concatenate([dummies_target] + x_last, axis=0)
        #
        # # data of x and dummies
        # xx = [self.augment(seq, markers, ctrl_start=[1, 0], add_marker_data=True, add_marker_dummy=False) for seq in x]
        #
        # # data of x
        # data_1 = [arr for a in xx for arr in a[:-1]]
        #
        # # this is a marker between sub sequence x and dummies
        # inter_seq = self.add_ctrl(np.zeros((1, self.data_bits)), ctrl_inter, pos)
        #
        # # dummies of x
        # x_dummy_last = [a[None, -1, :] for b in xx for a in b[-1:]]
        #
        # # concatenate all parts of the inputs
        # inputs = np.concatenate(data_1 + [inter_seq] + x_dummy_last, axis=0)
        #
        # # PyTorch variables
        # inputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        # targets = torch.from_numpy(targets).type(self.app_state.dtype)
        #
        # # TODO: batch might have different sequence lengths
        # mask_all = inputs[..., 0:self.control_bits] == 1
        # mask = mask_all[..., 0]
        # for i in range(self.control_bits):
        #     mask = mask_all[..., i] * mask
        #
        # # TODO: fix the batch indexing
        # # rest channel values of data dummies
        # inputs[mask[0], 0:self.control_bits] = 0
        #
        # # Return data_dict.
        # data_dict = DataDict({key: None for key in self.data_definitions.keys()})
        # data_dict['sequences'] = inputs
        # data_dict['sequences_length'] = max(seq_length)
        # data_dict['targets'] = targets
        # data_dict['mask'] = mask
        # data_dict['num_subsequences'] = num_sub_seq
        #
        # --- disabled implementation (end) ---

        # Placeholder result: every declared key mapped to None.
        placeholders = dict.fromkeys(self.data_definitions.keys())
        return DataDict(placeholders)
コード例 #18
0
    def collate_fn(self, batch):
        """
        Generate a whole batch of samples on-the-fly.

        .. warning::
            The number of sub-sequences and their lengths are drawn randomly once per call and are
            shared by every sample of the batch (**but vary between batches**). Samples generated
            independently could have different sequence lengths and could not be merged without
            padding, so the usual scheme `merge together individual samples that can be retrieved
            in parallel with several workers` is not followed here.

            The samples created by ``__getitem__`` are not used: ``batch`` only provides the batch size.


        :param batch: list of samples retrieved by ``__getitem__``. --> **Only its length is used!**

        :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with:

            - sequences: inputs, batch-first, concatenated along the sequence axis (axis 1),
            - sequences_length: longest of the randomly drawn sub-sequence lengths,
            - targets: zeros followed by the last item of every sub-sequence, concatenated along axis 1,
            - mask: product over the control channels of "control bit equals 1",
            - num_subsequences: number of sub-sequences drawn for this batch.

        # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED
        """
        n_samples = len(batch)

        # Control-channel markers.
        position = [0, 0]
        data_marker = [0, 0]
        dummy_marker = [0, 1]
        separator_marker = [0, 1]
        marker_set = data_marker, dummy_marker, position

        # Draw how many sub-sequences there are, then one length per sub-sequence.
        subseq_count = np.random.randint(self.num_subseq_min, self.num_subseq_max + 1)
        subseq_lengths = np.random.randint(
            low=self.min_sequence_length,
            high=self.max_sequence_length + 1,
            size=subseq_count)

        # Random bit patterns, one array per sub-sequence, plus the last item of each.
        subsequences = [
            np.random.binomial(1, self.bias, (n_samples, length, self.data_bits))
            for length in subseq_lengths
        ]
        final_items = [bits[:, None, -1, :] for bits in subsequences]

        # Targets: a block of zeros, then the last item of every sub-sequence.
        blank_steps = sum(subseq_lengths) + subseq_lengths.shape[0] + 1
        blank_targets = np.zeros([n_samples, blank_steps, self.data_bits], dtype=np.float32)
        targets = np.concatenate([blank_targets] + final_items, axis=1)

        # Sub-sequences augmented with control markers (data parts followed by dummies).
        augmented = [
            self.augment(bits, marker_set, ctrl_start=[1, 0], add_marker_data=True, add_marker_dummy=False)
            for bits in subsequences
        ]

        # Everything but the last element of each augmented result.
        data_parts = [part for parts in augmented for part in parts[:-1]]

        # Marker separating the sub-sequences from the dummies.
        separator = self.add_ctrl(np.zeros((n_samples, 1, self.data_bits)), separator_marker, position)

        # Last step of the last element of each augmented result.
        dummy_tail = [part[:, None, -1, :] for parts in augmented for part in parts[-1:]]

        # Assemble the inputs along the sequence axis.
        inputs = np.concatenate(data_parts + [separator] + dummy_tail, axis=1)

        # Convert to PyTorch tensors.
        inputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        targets = torch.from_numpy(targets).type(self.app_state.dtype)

        # TODO: batch might have different sequence lengths
        control_flags = inputs[..., 0:self.control_bits] == 1
        mask = control_flags[..., 0]
        for bit in range(self.control_bits):
            mask = control_flags[..., bit] * mask

        # TODO: fix the batch indexing
        # Reset the control channels at the steps selected by the first sample's mask.
        inputs[:, mask[0], 0:self.control_bits] = 0

        collated = DataDict(dict.fromkeys(self.data_definitions.keys()))
        collated['sequences'] = inputs
        collated['sequences_length'] = max(subseq_lengths)
        collated['targets'] = targets
        collated['mask'] = mask
        collated['num_subsequences'] = subseq_count

        return collated
コード例 #19
0
    def collate_fn(self, batch):
        """
        Generate a whole batch of samples on-the-fly.

        .. warning::
            The sequence length is drawn randomly between ``self.min_sequence_length`` and
            ``self.max_sequence_length`` once per call and shared by every sample of the batch
            (**but varies between batches**). Samples generated independently could have different
            lengths and could not be merged without padding, so the usual scheme `merge together
            individual samples that can be retrieved in parallel with several workers` is not
            followed here.

            The samples created by ``__getitem__`` are not used: ``batch`` only provides the batch size.


        :param batch: list of samples retrieved by ``__getitem__``. --> **Only its length is used!**

        :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with:

            - sequences: [BATCH_SIZE, 2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS],
            - **sequences_length: random value between self.min_sequence_length and self.max_sequence_length**
            - targets: [BATCH_SIZE, 2*SEQ_LENGTH+2, DATA_BITS],
            - mask: [BATCH_SIZE, 2*SEQ_LENGTH+2]
            - num_subsequences: 1

        """
        n_samples = len(batch)

        # One sequence length for the whole batch.
        seq_length = np.random.randint(self.min_sequence_length, self.max_sequence_length + 1)

        # Random bit sequences: [BATCH_SIZE x SEQ_LENGTH x DATA_BITS].
        bit_seq = np.random.binomial(1, self.bias, (n_samples, seq_length, self.data_bits))

        total_steps = 2 * seq_length + 2
        recall_start = seq_length + 2
        first_data_channel = self.control_bits
        channel_count = self.control_bits + self.data_bits

        # Inputs: [BATCH_SIZE, 2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS].
        inputs = np.zeros([n_samples, total_steps, channel_count], dtype=np.float32)
        inputs[:, 0, 0] = 1  # Start control marker: memorization bit.
        inputs[:, 1:seq_length + 1, first_data_channel:channel_count] = bit_seq
        inputs[:, seq_length + 1, 1] = 1  # End control marker: recall bit.

        # Targets: [BATCH_SIZE, 2*SEQ_LENGTH+2, DATA_BITS] (only data bits!).
        targets = np.zeros([n_samples, total_steps, self.data_bits], dtype=np.float32)

        # The sequence is rotated by shifting items to the right: seq >> num_items,
        # i.e. num_items = 2 -> seq_items >> 2 and num_items = -1 -> seq_items << 1.
        # The shift below is expressed as a left shift, hence the sign change.
        shift = -self.num_items

        # A magnitude below 1 is a rotation relative to the sequence length.
        if -1 < shift < 1:
            shift = shift * seq_length

        # Round to a whole number of items, then wrap around the sequence length.
        shift = np.round(shift)
        shift = int(shift % seq_length)

        # Left-rotate along the sequence axis and store the result in the recall part.
        targets[:, recall_start:, :] = np.roll(bit_seq, -shift, axis=1)

        # Target mask: [BATCH_SIZE, 2*SEQ_LENGTH+2], set over the recall part only.
        mask = torch.zeros([n_samples, total_steps]).type(self.app_state.ByteTensor)
        mask[:, recall_start:] = 1

        # Convert to PyTorch tensors.
        ptinputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        pttargets = torch.from_numpy(targets).type(self.app_state.dtype)

        collated = DataDict(dict.fromkeys(self.data_definitions.keys()))
        collated['sequences'] = ptinputs
        collated['sequences_length'] = seq_length
        collated['targets'] = pttargets
        collated['mask'] = mask
        collated['num_subsequences'] = 1

        return collated
コード例 #20
0
    def __getitem__(self, index):
        """
        Generate one sample of the sequence-rotation task on the fly.

        The input shows a random bit sequence framed by a "memorization" marker and a
        "recall" marker; the target holds the same sequence, rotated by ``self.num_items``
        positions, during the recall phase.

        .. note::

            The sequence length is sampled on every call, between ``self.min_sequence_length``
            and ``self.max_sequence_length`` (both included).

            A ``self.num_items`` strictly between -1 and 1 is treated as a fraction of the
            sequence length; any other value is an absolute number of items. Positive values
            rotate the items to the right, negative values to the left.


        :param index: index of the sample to return (not used by the generation code).

        :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with:

            - sequences: [2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS],
            - sequences_length: the sequence length drawn for this sample,
            - targets: [2*SEQ_LENGTH+2, DATA_BITS],
            - mask: [2*SEQ_LENGTH+2], equal to 1 on the recall steps only,
            - num_subsequences: 1


        """
        # Draw the length of the sequence (randint excludes the upper bound).
        length = np.random.randint(self.min_sequence_length,
                                   self.max_sequence_length + 1)

        # Layout along time: start marker, sequence, recall marker, recall phase.
        total_steps = 2 * length + 2
        recall_start = length + 2

        # Random bits to memorize: [SEQ_LENGTH, DATA_BITS].
        payload = np.random.binomial(1, self.bias, (length, self.data_bits))

        # Inputs: control channels first, data channels after them.
        first_data_col = self.control_bits
        inputs = np.zeros([total_steps, first_data_col + self.data_bits],
                          dtype=np.float32)
        inputs[0, 0] = 1  # Memorization bit.
        inputs[1:length + 1,
               first_data_col:first_data_col + self.data_bits] = payload
        inputs[length + 1, 1] = 1  # Recall bit.

        # The slicing offset has the opposite sign of the requested rotation:
        # rotating right by k means starting the read k items before the end.
        offset = -self.num_items
        if -1 < offset < 1:
            # Relative rotation: scale by the actual sequence length.
            offset = offset * length
        # Round to a whole number of items and wrap into [0, SEQ_LENGTH).
        offset = int(np.round(offset) % length)

        # Targets carry only the data bits, and only in the recall phase.
        targets = np.zeros([total_steps, self.data_bits], dtype=np.float32)
        targets[recall_start:, :] = np.roll(payload, -offset, axis=0)

        # Mask selecting the recall phase.
        mask = torch.zeros([total_steps]).type(self.app_state.ByteTensor)
        mask[recall_start:] = 1

        # Convert to PyTorch tensors of the application-wide dtype.
        ptinputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        pttargets = torch.from_numpy(targets).type(self.app_state.dtype)

        # Assemble the sample.
        sample = DataDict(
            {key: None for key in self.data_definitions.keys()})
        sample['sequences'] = ptinputs
        sample['sequences_length'] = length
        sample['targets'] = pttargets
        sample['mask'] = mask
        sample['num_subsequences'] = 1

        return sample
コード例 #21
0
        file_folder_to_check = os.path.expanduser(file_folder_to_check)
        if not (os.path.isfile(file_folder_to_check) or os.path.isdir(file_folder_to_check)):
            self.logger.info('Downloading {}'.format(url))
            urllib.request.urlretrieve(url, os.path.expanduser(download_name), reporthook)
            return True
        else:
            self.logger.info('Dataset found at {}'.format(file_folder_to_check))
            return False


if __name__ == '__main__':
    # Unit test for Problem and DataDict.
    from miprometheus.utils.param_interface import ParamInterface

    params = ParamInterface()
    problem = Problem(params)

    # Declare two entries of the data dictionary, both typed as torch tensors.
    problem.data_definitions = {
        'inputs': dict(size=[-1, -1], type=[torch.Tensor]),
        'targets': dict(size=[-1], type=[torch.Tensor]),
    }

    # Other candidates: torch.nn.L1Loss, torch.nn.TripletMarginLoss
    problem.loss_function = torch.nn.CrossEntropyLoss()

    # Build a DataDict with every declared key mapped to None.
    datadict = DataDict(dict.fromkeys(problem.data_definitions))

    # datadict['inputs'] = torch.ones([64, 20, 512]).type(torch.FloatTensor)
    # datadict['targets'] = torch.ones([64, 20]).type(torch.FloatTensor)

    # print(repr(datadict))

コード例 #22
0
    def collate_fn(self, batch):
        """
        Merge a list of samples (as returned by ``__getitem__``) into one padded batch.

        .. note::

            Sentences have different lengths, so inputs and targets are zero-padded up to the
            longest input (resp. target) sentence of the batch.

            **Samples are reordered by decreasing input sentence length.**

            The padded length therefore differs from one batch to the next.


        :param batch: Individual samples to combine
        :type batch: list

        :return: ``DataDict({'inputs', 'inputs_length', 'inputs_text', 'targets', 'targets_length', 'targets_text'})``
            containing the batch.

        """
        num_samples = len(batch)

        # Padded sizes are given by the longest sentence on each side.
        longest_input = max(sample['inputs_length'] for sample in batch)
        longest_target = max(sample['targets_length'] for sample in batch)

        # Longest input sentence first.
        ordered = sorted(batch,
                         key=lambda sample: sample['inputs_length'],
                         reverse=True)

        # Zero-initialised containers: [batch_size, max_length, embedding_dim].
        padded_inputs = torch.zeros(
            num_samples, longest_input,
            self.embedding_dim).type(torch.FloatTensor)
        padded_targets = torch.zeros(
            num_samples, longest_target,
            self.embedding_dim).type(torch.FloatTensor)

        # Per-sample metadata, in the sorted order.
        input_lengths = [sample['inputs_length'] for sample in ordered]
        input_texts = [sample['inputs_text'] for sample in ordered]
        target_lengths = [sample['targets_length'] for sample in ordered]
        target_texts = [sample['targets_text'] for sample in ordered]

        # Create the DataDict and store the metadata.
        data_dict = DataDict(
            {key: None for key in self.data_definitions.keys()})
        data_dict['inputs_length'] = input_lengths
        data_dict['inputs_text'] = input_texts
        data_dict['targets_length'] = target_lengths
        data_dict['targets_text'] = target_texts

        # Copy each sentence into the leading part of its row; the tail stays zero.
        rows = zip(ordered, input_lengths, target_lengths)
        for row, (sample, in_len, out_len) in enumerate(rows):
            padded_inputs[row, :in_len, :] = sample['inputs']
            padded_targets[row, :out_len, :] = sample['targets']

        data_dict['inputs'] = padded_inputs
        data_dict['targets'] = padded_targets

        return data_dict
コード例 #23
0
    def __getitem__(self, index):
        """
        Generate one sample of the sequence-inversion task on the fly.

        The input shows a random bit sequence framed by a "memorization" marker and a
        "recall" marker; the target holds the bitwise negation of that sequence during
        the recall phase.

        .. note::

            The sequence length is sampled on every call, between ``self.min_sequence_length``
            and ``self.max_sequence_length`` (both included).


        :param index: index of the sample to return (not used by the generation code).

        :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with:

            - sequences: [2*SEQ_LENGTH+2, CONTROL_BITS+DATA_BITS],
            - sequences_length: the sequence length drawn for this sample,
            - targets: [2*SEQ_LENGTH+2, DATA_BITS],
            - mask: [2*SEQ_LENGTH+2], equal to 1 on the recall steps only,
            - num_subsequences: 1


        """
        # Draw the length of the sequence (randint excludes the upper bound).
        length = np.random.randint(self.min_sequence_length,
                                   self.max_sequence_length + 1)

        # Layout along time: start marker, sequence, recall marker, recall phase.
        total_steps = 2 * length + 2
        recall_start = length + 2

        # Random bits to memorize: [SEQ_LENGTH, DATA_BITS].
        payload = np.random.binomial(1, self.bias, (length, self.data_bits))

        # Inputs: control channels first, data channels after them.
        first_data_col = self.control_bits
        inputs = np.zeros([total_steps, first_data_col + self.data_bits],
                          dtype=np.float32)
        inputs[0, 0] = 1  # Memorization bit.
        inputs[1:length + 1,
               first_data_col:first_data_col + self.data_bits] = payload
        inputs[length + 1, 1] = 1  # Recall bit.

        # Targets carry only the data bits: the negated sequence, in the recall phase.
        targets = np.zeros([total_steps, self.data_bits], dtype=np.float32)
        targets[recall_start:, :] = np.logical_not(payload)

        # Mask selecting the recall phase.
        mask = torch.zeros([total_steps]).type(self.app_state.ByteTensor)
        mask[recall_start:] = 1

        # Convert to PyTorch tensors of the application-wide dtype.
        ptinputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        pttargets = torch.from_numpy(targets).type(self.app_state.dtype)

        # Assemble the sample.
        sample = DataDict(
            {key: None for key in self.data_definitions.keys()})
        sample['sequences'] = ptinputs
        sample['sequences_length'] = length
        sample['targets'] = pttargets
        sample['mask'] = mask
        sample['num_subsequences'] = 1

        return sample
コード例 #24
0
    def collate_fn(self, batch):
        """
        Generates a whole batch of samples on-the-fly.

        .. warning::
            The number of sub-sequences and the length of each of them are drawn randomly
            **once per call** and shared by every sample of the batch (they vary between
            batches). Samples produced independently by ``__getitem__`` could have different
            lengths and could not be stacked without padding, so this function does not merge
            them: it generates the entire batch itself.

            The content of ``batch`` is never read; only ``len(batch)`` is used, as the
            batch size.


        :param batch: Should be a list of DataDict retrieved by ``__getitem__``. \
        --> **Only its length is used here!**

        :return: DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with:

            - sequences: [BATCH_SIZE, TOTAL_LENGTH, CHANNELS] tensor of inputs
              (CHANNELS is presumably CONTROL_BITS+DATA_BITS -- depends on ``add_ctrl``/``augment``),
            - sequences_length: the largest of the lengths drawn for the ``x`` sub-sequences,
            - targets: [BATCH_SIZE, TOTAL_LENGTH, CHANNELS-CONTROL_BITS], zero everywhere except
              at the masked time steps,
            - mask: [BATCH_SIZE, TOTAL_LENGTH], true where all control channels of the
              generated inputs were equal to 1,
            - num_subsequences: number of ``x`` sub-sequences plus number of ``y`` sub-sequences.

        pattern of inputs: # x1 % y1 # x2 % y2 ... # xn % yn & d $ d`
        pattern of target: dummies ...   ...       ...   ...   yn  all(xi)
        mask: used to mask the data part of the target.
        xi, yi, and d(d'): sub sequences x of random length, sub sequence y of random length and dummies.


        # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED & IMPROVED

        """
        # The batch size is the only information taken from `batch`.
        batch_size = len(batch)

        # Control channel markers. They are hard-coded with 4 entries
        # (presumably self.control_bits == 4 -- TODO confirm).
        pos = [0, 0, 0, 0]
        ctrl_data = [0, 0, 0, 0]
        ctrl_dummy = [0, 0, 1, 0]
        ctrl_inter = [0, 0, 0, 1]
        # Markers handed to self.augment (helper defined outside this block).
        markers = ctrl_data, ctrl_dummy, pos

        # Number of x sub-sequences, drawn in [num_subseq_min, num_subseq_max]
        # (randint excludes its upper bound, hence the + 1).
        nb_sub_seq_a = np.random.randint(self.num_subseq_min,
                                         self.num_subseq_max + 1)
        # Same number of y sub-sequences; might be different in future implementation.
        nb_sub_seq_b = nb_sub_seq_a

        # One random length per sub-sequence, in [min_sequence_length, max_sequence_length].
        seq_lengths_a = np.random.randint(low=self.min_sequence_length,
                                          high=self.max_sequence_length + 1,
                                          size=nb_sub_seq_a)
        seq_lengths_b = np.random.randint(low=self.min_sequence_length,
                                          high=self.max_sequence_length + 1,
                                          size=nb_sub_seq_b)

        # Random bit sub-sequences x and y, each of shape [batch_size, n, data_bits].
        x = [
            np.random.binomial(1, self.bias, (batch_size, n, self.data_bits))
            for n in seq_lengths_a
        ]
        y = [
            np.random.binomial(1, self.bias, (batch_size, n, self.data_bits))
            for n in seq_lengths_b
        ]

        # Data to recall: the last y followed by every x, joined along the time axis.
        target_wo_dummies = np.concatenate([y[-1]] + x, axis=1)

        # Add a marker at the beginning of each x and build its dummies.
        # NOTE(review): the code below assumes each augment() result is a list
        # whose last element is the dummies part -- confirm against augment().
        xx = [
            self.augment(seq,
                         markers,
                         ctrl_start=[1, 0, 0, 0],
                         add_marker_data=True,
                         add_marker_dummy=False) for seq in x
        ]
        # Same for each y, with a different start marker; add_marker_dummy is left
        # at its default here (presumably also marking the beginning of the dummies).
        yy = [
            self.augment(seq,
                         markers,
                         ctrl_start=[0, 1, 0, 0],
                         add_marker_data=True) for seq in y
        ]

        # One-step marker separating the dummies of y from those of x at the end
        # of the sequence.
        inter_seq = self.add_ctrl(np.zeros((batch_size, 1, self.data_bits)),
                                  ctrl_inter, pos)

        # Everything but the last element of each augmented x and y, interleaved
        # as x1, y1, x2, y2, ...
        data_1 = [arr for a, b in zip(xx, yy) for arr in a[:-1] + b[:-1]]

        # Last element of the last y, the separator, then the last element of every x.
        data_2 = [yy[-1][-1]] + [inter_seq] + [a[-1] for a in xx]

        # Concatenate all parts of the inputs along the time axis.
        inputs = np.concatenate(data_1 + data_2, axis=1)

        # Convert to PyTorch tensors of the application-wide dtype.
        inputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        target_wo_dummies = torch.from_numpy(target_wo_dummies).type(
            self.app_state.dtype)

        # Build the mask: a time step is selected when every control channel equals 1.
        # The loop starts at channel 0, which is already the initial value; this is
        # harmless since the entries are 0/1.
        mask_all = inputs[:, :, 0:self.control_bits] == 1
        mask = mask_all[..., 0]
        for i in range(self.control_bits):
            mask = mask_all[..., i] * mask

        # Reset the control channels at the masked time steps. The mask of the first
        # sample is applied to the whole batch (all samples share the same layout).
        inputs[:, mask[0], 0:self.control_bits] = 0

        # Targets: zeros shaped like the non-control part of the inputs, with the
        # data to recall written at the masked time steps.
        targets = torch.zeros_like(inputs[:, :, self.control_bits:])
        targets[:, mask[0], :] = target_wo_dummies

        # Return data_dict.
        data_dict = DataDict(
            {key: None
             for key in self.data_definitions.keys()})
        data_dict['sequences'] = inputs
        data_dict['sequences_length'] = max(seq_lengths_a)
        data_dict['targets'] = targets
        data_dict['mask'] = mask
        data_dict['num_subsequences'] = nb_sub_seq_a + nb_sub_seq_b

        return data_dict
コード例 #25
0
    def __getitem__(self, index):
        """
        Placeholder getter: the on-the-fly generation of one sample is currently disabled.

        .. warning::

            The generation code below is wrapped in a string literal and is therefore never
            executed. As written, this method ignores ``index`` and returns a DataDict with one
            entry per key of ``self.data_definitions``, **every value being ``None``**.


        :param index: index of the sample to return (currently unused).

        :return: DataDict whose keys are those of ``self.data_definitions`` and whose values are all ``None``.

        The disabled code is intended to produce
        DataDict({'sequences', 'sequences_length', 'targets', 'mask', 'num_subsequences'}), with:

            - sequences: [SEQ_LENGTH, CONTROL_BITS+DATA_BITS],
            - sequences_length: largest of the lengths drawn for the ``x`` sub-sequences,
            - targets: [SEQ_LENGTH, DATA_BITS],
            - mask: [SEQ_LENGTH]
            - num_subsequences: number of ``x`` sub-sequences plus number of ``y`` sub-sequences

            pattern of inputs: # x1 % y1 # x2 % y2 ... # xn % yn & d $ d`
            pattern of target: dummies ...   ...       ...   ...   yn  all(xi)
            mask: used to mask the data part of the target.
            xi, yi, and d(d'): sub sequences x of random length, sub sequence y of random length and dummies.

        # TODO: THE DOCUMENTATION NEEDS TO BE UPDATED
        # TODO: This is commented for now to avoid the issue with `add_ctrl` and `augment` in AlgorithmicSeqToSeqProblem
        # TODO: NOT SURE THAT THIS FN IS WORKING WELL (WITHOUT THE PRESENCE OF THE BATCH DIMENSION)

        """
        # Disabled implementation, kept for reference as a string literal (a no-op
        # expression statement). It is a single-sample variant of the generation
        # scheme, without the batch dimension.
        '''
        # define control channel markers
        pos = [0, 0, 0, 0]
        ctrl_data = [0, 0, 0, 0]
        ctrl_dummy = [0, 0, 1, 0]
        ctrl_inter = [0, 0, 0, 1]
        # assign markers
        markers = ctrl_data, ctrl_dummy, pos

        # number of sub_sequences
        nb_sub_seq_a = np.random.randint(
            self.num_subseq_min, self.num_subseq_max + 1)
        # might be different in future implementation
        nb_sub_seq_b = nb_sub_seq_a

        # set the sequence length of each marker
        seq_lengths_a = np.random.randint(
            low=self.min_sequence_length,
            high=self.max_sequence_length + 1,
            size=nb_sub_seq_a)
        seq_lengths_b = np.random.randint(
            low=self.min_sequence_length,
            high=self.max_sequence_length + 1,
            size=nb_sub_seq_b)

        #  generate subsequences for x and y
        x = [
            np.random.binomial(
                1,
                self.bias,
                (n,
                 self.data_bits)) for n in seq_lengths_a]
        y = [
            np.random.binomial(
                1,
                self.bias,
                (n,
                 self.data_bits)) for n in seq_lengths_b]

        # create the target
        target_wo_dummies = np.concatenate([y[-1]] + x, axis=0)

        # add marker at the begging of x and dummies
        xx = [
            self.augment(
                seq,
                markers,
                ctrl_start=[
                    1,
                    0,
                    0,
                    0],
                add_marker_data=True,
                add_marker_dummy=False) for seq in x]
        # add marker at the begging of y and dummies of same length, also a
        # marker at the begging of dummies is added
        yy = [self.augment(seq, markers, ctrl_start=[
                           0, 1, 0, 0], add_marker_data=True) for seq in y]

        # this is a marker to separate dummies of x and y at the end of the
        # sequence
        inter_seq = self.add_ctrl(
            np.zeros((1, self.data_bits)), ctrl_inter, pos)

        # data which contains all xs and all ys
        data_1 = [arr for a, b in zip(xx, yy) for arr in a[:-1] + b[:-1]]

        # dummies of y and xs
        data_2 = [yy[-1][-1]] + [inter_seq] + [a[-1] for a in xx]

        # concatenate all parts of the inputs
        inputs = np.concatenate(data_1 + data_2, axis=0)

        # PyTorch variables
        inputs = torch.from_numpy(inputs).type(self.app_state.dtype)
        target_wo_dummies = torch.from_numpy(
            target_wo_dummies).type(self.app_state.dtype)

        # create the mask
        mask_all = inputs[:, 0:self.control_bits] == 1
        mask = mask_all[..., 0]
        for i in range(self.control_bits):
            mask = mask_all[..., i] * mask

        # rest ctrl channel of dummies
        inputs[mask[0], 0:self.control_bits] = 0

        # Create the target with the dummies
        targets = torch.zeros_like(inputs[:, self.control_bits:])
        targets[mask[0], :] = target_wo_dummies

        # Return data_dict.
        data_dict = DataDict({key: None for key in self.data_definitions.keys()})
        data_dict['sequences'] = inputs
        data_dict['sequences_length'] = max(seq_lengths_a)
        data_dict['targets'] = targets
        data_dict['mask'] = mask
        data_dict['num_subsequences'] = nb_sub_seq_a + nb_sub_seq_b
        '''

        # Until the code above is re-enabled, return an "empty" DataDict: all the
        # expected keys are present, each mapped to None.
        return DataDict({key: None
                         for key in self.data_definitions.keys()})  # data_dict