Example #1
    def get_special_activations(self, iterator, language, transformation):
        """ Model predictions are generated by batch and apply different transformation to the input: the model
        take the whole sentence as input.
        For each word, words outside its context window are either:
            - shuffled
            - replaced by random words
            - replaced by words with same POS and relationship dependencies
        """
        hidden_states_activations = []
        attention_heads_activations = []
        # Here, a batch is just a single sentence: the transformation prevents us from building batches of equal length
        batches, indexes = utils.batchify_sentences(
            iterator,
            self.config['number_of_sentence'],
            self.config['number_of_sentence_before'],
            self.pretrained_gpt2_model,
            past_context_size=self.config['attention_length_before'],
            transformation=transformation,
            vocabulary=self.config['tokens_vocabulary'],
            dictionary=self.config['pos_dictionary'],
            seed=self.config['seed'],
            max_length=self.config['max_length'])

        for index_batch, batch in enumerate(batches):
            batch = batch.strip()  # Remove trailing character

            batch = '<|endoftext|> ' + batch + ' <|endoftext|>'
            tokenized_text = self.tokenizer.tokenize(batch,
                                                     add_prefix_space=False)
            #print('Batch number: ', index_batch, ' - ' , batch)
            #print(tokenized_text)
            #print('indexes:', indexes[index_batch], tokenized_text[indexes[index_batch][0]:indexes[index_batch][1]])
            #print()
            inputs_ids = torch.tensor(
                [self.tokenizer.convert_tokens_to_ids(tokenized_text)])

            mapping = utils.match_tokenized_to_untokenized(
                tokenized_text, batch)

            with torch.no_grad():
                encoded_layers = self.model(
                    inputs_ids
                )  # last_hidden_state, pooler_output, hidden_states, attentions

                if self.model.config.output_hidden_states:
                    hidden_states_activations_ = np.vstack(
                        encoded_layers[2]
                    )  # retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count)
                    hidden_states_activations += utils.extract_activations_from_token_activations_special(
                        hidden_states_activations_, mapping,
                        indexes[index_batch]
                    )  # TODO: verify whether 1 needs to be added to the index values

                if self.model.config.output_attentions:
                    raise NotImplementedError('Not yet implemented...')

        if self.model.config.output_hidden_states:
            hidden_states_activations = pd.DataFrame(
                np.vstack(hidden_states_activations),
                columns=[
                    'hidden_state-layer-{}-{}'.format(layer, index)
                    for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                    for index in range(1, 1 + self.FEATURE_COUNT)
                ])

        if self.model.config.output_attentions:
            raise NotImplementedError('Not yet implemented...')

        return [hidden_states_activations, attention_heads_activations]
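The transformation itself is applied inside utils.batchify_sentences, which is not shown here. As a rough, hypothetical sketch of what the docstring describes (assuming the text is a plain list of words and that the context window only extends into the past; the function name and signature are made up for illustration):

import random

def transform_out_of_window(words, target_index, context_size, mode='shuffle',
                            vocabulary=None, seed=1111):
    """Illustrative only: shuffle or replace every word that falls outside the
    `context_size` tokens preceding `target_index`."""
    random.seed(seed)
    words = list(words)
    window_start = max(0, target_index - context_size)
    outside = list(range(window_start))  # indices before the context window
    if mode == 'shuffle':
        shuffled = [words[i] for i in outside]
        random.shuffle(shuffled)
        for i, w in zip(outside, shuffled):
            words[i] = w
    elif mode == 'random':
        for i in outside:
            words[i] = random.choice(vocabulary)  # vocabulary: hypothetical list of candidate words
    return words

# e.g. transform_out_of_window('the cat sat on the mat'.split(), target_index=5, context_size=2)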
Example #2
    def get_token_level_activations(self, iterator, language):
        """ Model predictions are generated by batch with small attention masks: the model
        take the whole sentence as input.
        """
        hidden_states_activations = []
        attention_heads_activations = []
        # Here, the input is fed to the model one batch of lines at a time.
        batches, indexes = utils.batchify_with_detailed_indexes(
            iterator,
            self.config['number_of_sentence'],
            self.config['number_of_sentence_before'],
            self.pretrained_gpt2_model,
            max_length=self.config['max_length'],
            add_prefix_space=self.add_prefix_space)
        indexes_tmp = [(indexes[i][-self.config['number_of_sentence']][0],
                        indexes[i][-1][1]) for i in range(len(indexes))]
        indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])
        # we add 1 because of the initial special token
        for i in range(len(indexes_tmp)):
            indexes_tmp[i] = (indexes_tmp[i][0] + 1, indexes_tmp[i][1] + 1)

        for index_batch, batch in enumerate(batches):
            batch = batch.strip()  # Remove trailing character
            batch = '<|endoftext|> ' + batch + ' <|endoftext|>'

            tokenized_text = self.tokenizer.tokenize(batch,
                                                     add_prefix_space=False)
            mapping = utils.match_tokenized_to_untokenized(
                tokenized_text, batch)

            beg = indexes_tmp[index_batch][0]
            end = indexes_tmp[index_batch][1]

            inputs_ids = torch.tensor(
                [self.tokenizer.convert_tokens_to_ids(tokenized_text)])
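            # Duplicate the input once per token so that each copy can be paired, below,
            # with its own row of the attention mask (rows are then restricted to beg:end).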
            inputs_ids = torch.cat(inputs_ids.size(1) * [inputs_ids])
            inputs_ids = inputs_ids[beg:end, :]

            attention_mask = torch.diag_embed(
                torch.tensor([0 for x in tokenized_text]))
            for i in range(
                    min(len(tokenized_text), self.attention_length_before)):
                attention_mask = torch.add(
                    attention_mask,
                    torch.diag_embed(torch.tensor(
                        [1 for x in range(len(tokenized_text) - i)]),
                                     offset=-i))
            attention_mask = attention_mask[beg:end, :]

            with torch.no_grad():
                encoded_layers = self.model(
                    inputs_ids, attention_mask=attention_mask
                )  # last_hidden_state, pooler_output, hidden_states, attentions
                # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
                # pooler_output dimension: (batch_size, hidden_size)
                # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
                # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
                # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
                #                                                       (batch_size, num_heads, sequence_length, sequence_length)]
                # filtration
                if self.model.config.output_hidden_states:
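                    # Each row of the duplicated batch targets a single token: from every
                    # layer's output, keep only the hidden state at that target position.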
                    hidden_states_activations_ = np.vstack([
                        torch.cat([
                            encoded_layers[2][layer]
                            [i,
                             len(tokenized_text) -
                             encoded_layers[2][layer].size(0) + i -
                             1, :].unsqueeze(0)
                            for i in range(encoded_layers[2][layer].size(0))
                        ],
                                  dim=0).unsqueeze(0).detach().numpy()
                        for layer in range(len(encoded_layers[2]))
                    ])
                    hidden_states_activations_ = np.concatenate([
                        np.zeros((hidden_states_activations_.shape[0],
                                  indexes_tmp[index_batch][0],
                                  hidden_states_activations_.shape[-1])),
                        hidden_states_activations_,
                        np.zeros(
                            (hidden_states_activations_.shape[0],
                             len(tokenized_text) - indexes_tmp[index_batch][1],
                             hidden_states_activations_.shape[-1]))
                    ],
                                                                axis=1)
                    # retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count)
                    hidden_states_activations += utils.extract_activations_from_token_activations(
                        hidden_states_activations_, mapping,
                        indexes_tmp[index_batch])

                if self.model.config.output_attentions:
                    raise NotImplementedError('Not yet implemented...')
                    #attention_heads_activations_ = np.vstack([torch.cat([encoded_layers[-1][layer][0][i,:,i,:].unsqueeze(0) for i in range(len(tokenized_text))], dim=0).unsqueeze(0).detach().numpy() for layer in range(len(encoded_layers[-1]))])
                    #attention_heads_activations_ = np.swapaxes(attention_heads_activations_, 1, 2)
                    #attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes_tmp[index_batch])
        if self.model.config.output_hidden_states:
            hidden_states_activations = pd.DataFrame(
                np.vstack(hidden_states_activations),
                columns=[
                    'hidden_state-layer-{}-{}'.format(layer, index)
                    for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                    for index in range(1, 1 + self.FEATURE_COUNT)
                ])
        if self.model.config.output_attentions:
            raise NotImplementedError('Not yet implemented...')
            #attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        return [hidden_states_activations, attention_heads_activations]
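For reference, the torch.diag_embed loop above builds a banded lower-triangular mask in which each token attends to itself and to at most attention_length_before - 1 preceding tokens. A toy reproduction of that construction, with made-up sizes, makes the resulting pattern explicit:

import torch

seq_len, attention_length_before = 6, 3  # hypothetical toy sizes
mask = torch.diag_embed(torch.tensor([0] * seq_len))
for i in range(min(seq_len, attention_length_before)):
    mask = torch.add(mask,
                     torch.diag_embed(torch.tensor([1] * (seq_len - i)), offset=-i))
print(mask)
# tensor([[1, 0, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 0, 0, 0],
#         [0, 1, 1, 1, 0, 0],
#         [0, 0, 1, 1, 1, 0],
#         [0, 0, 0, 1, 1, 1]])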
Example #3
    def get_classic_activations(self, iterator, language):
        """ Extract hidden state activations of the model for each token from the input, on a 
        word-by-word predictions or sentence-by-sentence prediction.
        Optionally includes surprisal and entropy.
        Input text should have one sentence per line, where each word and every 
        symbol is separated from the following by a space. No <eos> token should be included,
        as they are automatically integrated during tokenization.
        Arguments: 
            - iterator: iterator object, 
            generally: iterator = tokenize(path, language, self.vocab)
            - includ_surprisal: bool specifying if we include surprisal
            - includ_entropy: bool specifying if we include entropy
            - parameters: list (of string representing gate names)
        Returns:
            - result: pd.DataFrame containing activation (+ optionally entropy
            and surprisal)
        """
        hidden_states_activations = []
        attention_heads_activations = []
        # Here, the input is fed to the model one batch of lines at a time.
        batches, indexes = utils.batchify_with_detailed_indexes(
            iterator,
            self.config['number_of_sentence'],
            self.config['number_of_sentence_before'],
            self.pretrained_gpt2_model,
            max_length=self.config['max_length'],
            stop_attention_at_sent=self.config['stop_attention_at_sent'],
            stop_attention_before_sent=self.config['stop_attention_before_sent'],
            add_prefix_space=self.add_prefix_space)
        indexes_tmp = [(indexes[i][-self.config['number_of_sentence']][0],
                        indexes[i][-1][1]) for i in range(len(indexes))]
        indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])

        for i in range(len(indexes_tmp)):
            indexes_tmp[i] = (indexes_tmp[i][0] + 1, indexes_tmp[i][1] + 1)

        for index, batch in enumerate(batches):
            batch = batch.strip()  # Remove trailing character
            batch = '<|endoftext|> ' + batch + ' <|endoftext|>'

            tokenized_text = self.tokenizer.tokenize(batch,
                                                     add_prefix_space=False)
            mapping = utils.match_tokenized_to_untokenized(
                tokenized_text, batch)
            inputs_ids = torch.tensor(
                [self.tokenizer.convert_tokens_to_ids(tokenized_text)])

            if self.prediction_type == 'sentence':
                attention_mask = torch.tensor([[1 for x in tokenized_text]])

                if (self.config['stop_attention_at_sent']
                        is not None) and (index > 0):
                    # Start index of the sentence used as the attention cut-off point
                    sent_start = indexes[index][-self.config['stop_attention_at_sent'] -
                                                self.config['number_of_sentence']][0]
                    attention_mask[:, :sent_start] = 0
                    shift = self.config['stop_attention_before_sent']
                    if shift < 0:
                        attention_mask[:, 1 + sent_start:1 + sent_start - shift] = 0
                    elif shift > 0:
                        attention_mask[:, 1 + sent_start - shift:1 + sent_start] = 1

            elif 'token-level' in self.prediction_type:
                attention_mask = torch.diag_embed(
                    torch.tensor([0 for x in tokenized_text]))
                for i in range(
                        min(len(tokenized_text),
                            self.attention_length_before)):
                    attention_mask = torch.add(
                        attention_mask,
                        torch.diag_embed(torch.tensor(
                            [1 for x in range(len(tokenized_text) - i)]),
                                         offset=-i))
                attention_mask = attention_mask.unsqueeze(0)
                if 'reverse' in self.prediction_type:
                    attention_mask = 1 - attention_mask

            with torch.no_grad():
                encoded_layers = self.model(
                    inputs_ids, attention_mask=attention_mask
                )  # last_hidden_state, pooler_output, hidden_states, attentions
                # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
                # pooler_output dimension: (batch_size, hidden_size)
                # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
                # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
                # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
                #                                                       (batch_size, num_heads, sequence_length, sequence_length)]
                # filtration
                if self.model.config.output_hidden_states:
                    hidden_states_activations_ = np.vstack(
                        encoded_layers[2]
                    )  # retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count)
                    hidden_states_activations += utils.extract_activations_from_token_activations(
                        hidden_states_activations_, mapping,
                        indexes_tmp[index])
                if self.model.config.output_attentions:
                    attention_heads_activations_ = np.vstack(
                        [array[0] for array in encoded_layers[3]])
                    attention_heads_activations += utils.extract_heads_activations_from_token_activations(
                        attention_heads_activations_, mapping,
                        indexes_tmp[index])
        if self.model.config.output_hidden_states:
            hidden_states_activations = pd.DataFrame(
                np.vstack(hidden_states_activations),
                columns=[
                    'hidden_state-layer-{}-{}'.format(layer, index)
                    for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                    for index in range(1, 1 + self.FEATURE_COUNT)
                ])
        if self.model.config.output_attentions:
            attention_heads_activations = pd.DataFrame(
                np.vstack(attention_heads_activations),
                columns=[
                    'attention-layer-{}-head-{}-{}'.format(layer, head, index)
                    for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS)
                    for head in range(1, 1 + self.NUM_ATTENTION_HEADS)
                    for index in range(
                        1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)
                ])
        return [hidden_states_activations, attention_heads_activations]
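In the 'sentence' branch above, the mask starts as all ones and attention is then cut off for every token before the sentence selected by stop_attention_at_sent, with stop_attention_before_sent shifting that cut-off by a few tokens. A simplified toy version of the positive-shift case, with hypothetical sizes:

import torch

seq_len, sent_start, shift = 10, 6, 2  # hypothetical: cut-off sentence starts at token 6
attention_mask = torch.tensor([[1] * seq_len])
attention_mask[:, :sent_start] = 0                    # mask everything before the cut-off sentence
attention_mask[:, sent_start - shift:sent_start] = 1  # re-open `shift` tokens just before it
print(attention_mask)
# tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]])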
Example #4
    def get_classic_activations(self, iterator, language):
        """ Model predictions are generated in the classical way: the model
        take the whole sentence as input.
        """
        hidden_states_activations = []
        attention_heads_activations = []
        cls_hidden_states_activations = []
        sep_hidden_states_activations = []
        cls_attention_activations = []
        sep_attention_activations = []
        # Here, the input is fed to the model one batch of lines at a time.
        batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
            iterator, 
            self.tokenizer, 
            self.config['number_of_sentence'], 
            self.config['number_of_sentence_before'], 
            self.config['number_of_sentence_after'], 
            max_length=self.config['max_length'])
            
        for index, batch in enumerate(batches):
            batch = batch.strip() # Remove trailing character

            batch = '[CLS] ' + batch + ' [SEP]'
            tokenized_text = self.tokenizer.wordpiece_tokenizer.tokenize(batch)
            inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])
            attention_mask = torch.tensor([[1 for x in tokenized_text]])
            mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)

            with torch.no_grad():
                encoded_layers = self.model(inputs_ids, attention_mask=attention_mask) # last_hidden_state, pooler_output, hidden_states, attentions
                # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
                # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
                # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
                # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size), 
                #                                                       (batch_size, num_heads, sequence_length, sequence_length)]
                # filtration
                if self.model.config.output_hidden_states:
                    hidden_states_activations_ = np.vstack(encoded_layers[2]) # retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count)
                    hidden_states_activations += utils.extract_activations_from_token_activations(hidden_states_activations_, mapping, indexes[index])
                    cls_activations_, sep_activations_ = utils.extract_activations_from_special_tokens(hidden_states_activations_, mapping)
                    cls_hidden_states_activations += cls_activations_
                    sep_hidden_states_activations += sep_activations_
                if self.model.config.output_attentions:
                    attention_heads_activations_ = np.vstack([array[0].contiguous().numpy() for array in encoded_layers[3]])
                    attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes[index])
                    cls_attention_, sep_attention_ = utils.extract_heads_activations_from_special_tokens(attention_heads_activations_, mapping)
                    cls_attention_activations += cls_attention_
                    sep_attention_activations += sep_attention_
        if self.model.config.output_hidden_states:
            hidden_states_activations = pd.DataFrame(np.vstack(hidden_states_activations), columns=['hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
            cls_hidden_states_activations = pd.DataFrame(np.vstack(cls_hidden_states_activations), columns=['CLS-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
            sep_hidden_states_activations = pd.DataFrame(np.vstack(sep_hidden_states_activations), columns=['SEP-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
        if self.model.config.output_attentions:
            attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
            cls_attention_activations = pd.DataFrame(np.vstack(cls_attention_activations), columns=['CLS-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
            sep_attention_activations = pd.DataFrame(np.vstack(sep_attention_activations), columns=['SEP-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        return [hidden_states_activations, 
                attention_heads_activations, 
                cls_hidden_states_activations,
                sep_hidden_states_activations,
                cls_attention_activations,
                sep_attention_activations]
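The wide DataFrame layout used here (and in the other extractors) flattens layers and features into one column per (layer, unit) pair, layer 0 being the embedding output in the HuggingFace convention. A small illustration with made-up sizes:

import numpy as np
import pandas as pd

NUM_HIDDEN_LAYERS, FEATURE_COUNT, N_TOKENS = 2, 4, 3  # hypothetical toy sizes
activations = np.random.rand(N_TOKENS, (1 + NUM_HIDDEN_LAYERS) * FEATURE_COUNT)
columns = ['hidden_state-layer-{}-{}'.format(layer, index)
           for layer in np.arange(1 + NUM_HIDDEN_LAYERS)   # layers 0..NUM_HIDDEN_LAYERS
           for index in range(1, 1 + FEATURE_COUNT)]       # units 1..FEATURE_COUNT
df = pd.DataFrame(activations, columns=columns)
print(df.columns[:5].tolist())
# ['hidden_state-layer-0-1', 'hidden_state-layer-0-2', 'hidden_state-layer-0-3',
#  'hidden_state-layer-0-4', 'hidden_state-layer-1-1']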
Example #5
    def get_token_level_activations(self, iterator, language):
        """ Model predictions are generated by batch with small attention masks: the model
        take the whole sentence as input.
        """
        hidden_states_activations = []
        attention_heads_activations = []
        cls_hidden_states_activations = []
        sep_hidden_states_activations = []
        cls_attention_activations = []
        sep_attention_activations = []
        # Here, the input is fed to the model one batch of lines at a time.
        batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
            iterator,
            self.config['number_of_sentence'],
            self.config['number_of_sentence_before'],
            self.config['number_of_sentence_after'],
            self.pretrained_bert_model,
            max_length=self.config['max_length'])

        indexes_tmp = []
        # If the beginning and end indexes of each sentence are recorded, we only keep the sentence(s) of interest
        for i in range(len(indexes)):
            if type(indexes[i]) == list and type(indexes[i][0]) == list:
                indexes_tmp.append(indexes[i][-1])
            else:
                if i > 0:
                    indexes_tmp.append((
                        indexes[i][-self.config['number_of_sentence'] -
                                   self.config['number_of_sentence_after']][0],
                        indexes[i][-self.config['number_of_sentence'] -
                                   self.config['number_of_sentence_after']][1]
                    ))
                else:
                    indexes_tmp.append(None)

        if self.config['number_of_sentence_before'] == 0:
            indexes_tmp[0] = (indexes[0][0][0][0], indexes[0][-1][1])
        else:
            indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])

        for index_batch, batch in enumerate(batches):
            batch = batch.strip()  # Remove trailing character

            batch = '[CLS] ' + batch + ' [SEP]'
            tokenized_text = self.tokenizer.wordpiece_tokenizer.tokenize(batch)
            inputs_ids = torch.tensor(
                [self.tokenizer.convert_tokens_to_ids(tokenized_text)])
            inputs_ids = torch.cat(inputs_ids.size(1) * [inputs_ids])
            attention_mask = torch.diag_embed(
                torch.tensor([[0 for x in tokenized_text]]))

            for i in range(
                    min(len(tokenized_text), self.attention_length_before)):
                attention_mask = torch.add(
                    attention_mask,
                    torch.diag_embed(torch.tensor(
                        [[1 for x in range(len(tokenized_text) - i)]]),
                                     offset=-i))
            for i in range(
                    1, min(len(tokenized_text),
                           self.attention_length_after + 1)):
                attention_mask = torch.add(
                    attention_mask,
                    torch.diag_embed(torch.tensor(
                        [[1 for x in range(len(tokenized_text) - i)]]),
                                     offset=i))
            mapping = utils.match_tokenized_to_untokenized(
                tokenized_text, batch)

            attention_mask = attention_mask.squeeze(0)

            beg = indexes_tmp[index_batch][
                0] + 1  # because of the special token at the beginning
            end = indexes_tmp[index_batch][1] + 1  # because of special token

            inputs_ids = inputs_ids[beg:end, :]
            attention_mask = attention_mask[beg:end, :]

            dim = inputs_ids.size(1)
            if self.prediction_type == 'control-context-past':
                attention_mask = torch.stack([
                    attention_mask[index, :] * torch.tril(torch.ones(dim, dim))
                    for index in range(attention_mask.size(0))
                ])
            elif self.prediction_type == 'control-context-future':
                attention_mask = torch.stack([
                    attention_mask[index, :] * torch.triu(torch.ones(dim, dim))
                    for index in range(attention_mask.size(0))
                ])

            with torch.no_grad():
                encoded_layers = self.model(
                    inputs_ids, attention_mask=attention_mask
                )  # last_hidden_state, pooler_output, hidden_states, attentions
                # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
                # pooler_output dimension: (batch_size, hidden_size)
                # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
                # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
                # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
                #                                                       (batch_size, num_heads, sequence_length, sequence_length)]
                # filtration
                if self.model.config.output_hidden_states:
                    hidden_states_activations_ = np.vstack([
                        torch.cat([
                            encoded_layers[2][layer]
                            [i,
                             len(tokenized_text) -
                             encoded_layers[2][layer].size(0) + i -
                             1, :].unsqueeze(0)
                            for i in range(encoded_layers[2][layer].size(0))
                        ],
                                  dim=0).unsqueeze(0).detach().numpy()
                        for layer in range(len(encoded_layers[2]))
                    ])  # retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count)
                    hidden_states_activations_ = np.concatenate([
                        np.zeros((hidden_states_activations_.shape[0],
                                  indexes_tmp[index_batch][0] + 1,
                                  hidden_states_activations_.shape[-1])),
                        hidden_states_activations_,
                        np.zeros((hidden_states_activations_.shape[0],
                                  len(tokenized_text) -
                                  indexes_tmp[index_batch][1] - 1,
                                  hidden_states_activations_.shape[-1]))
                    ],
                                                                axis=1)

                    hidden_states_activations += utils.extract_activations_from_token_activations(
                        hidden_states_activations_, mapping,
                        indexes_tmp[index_batch])
                    #cls_activations_, sep_activations_ = utils.extract_activations_from_special_tokens(hidden_states_activations_, mapping)
                    #cls_hidden_states_activations += cls_activations_
                    #sep_hidden_states_activations += sep_activations_
                if self.model.config.output_attentions:
                    raise NotImplementedError('Not yet implemented...')
                    #attention_heads_activations_ = np.vstack([torch.cat([encoded_layers[-1][layer][0][i,len(tokenized_text) - encoded_layers[-1][layer][0].size(0) + i,:].unsqueeze(0) for i in range(encoded_layers[-1][layer][0].size(0))], dim=0).unsqueeze(0).detach().numpy() for layer in range(len(encoded_layers[-1]))])
                    #if indexes_tmp[index_batch][0] > 0:
                    #    attention_heads_activations_ = np.concatenate([np.zeros((attention_heads_activations_.shape[0], indexes_tmp[index_batch][0] , attention_heads_activations_.shape[-1])), attention_heads_activations_], axis=1)
                    #attention_heads_activations_ = attention_heads_activations_.reshape([
                    #    self.NUM_HIDDEN_LAYERS,
                    #    len(tokenized_text),
                    #    self.NUM_ATTENTION_HEADS,
                    #    self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS])
                    #attention_heads_activations_ = np.swapaxes(attention_heads_activations_, 1, 2)
                    #attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes_tmp[index_batch])
                    #cls_attention_, sep_attention_ = utils.extract_heads_activations_from_special_tokens(attention_heads_activations_, mapping)
                    #cls_attention_activations += cls_attention_
                    #sep_attention_activations += sep_attention_
        if self.model.config.output_hidden_states:
            hidden_states_activations = pd.DataFrame(
                np.vstack(hidden_states_activations),
                columns=[
                    'hidden_state-layer-{}-{}'.format(layer, index)
                    for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                    for index in range(1, 1 + self.FEATURE_COUNT)
                ])
            #cls_hidden_states_activations = pd.DataFrame(np.vstack(cls_hidden_states_activations), columns=['CLS-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
            #sep_hidden_states_activations = pd.DataFrame(np.vstack(sep_hidden_states_activations), columns=['SEP-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
        if self.model.config.output_attentions:
            raise NotImplementedError('Not yet implemented...')
            #attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
            #cls_attention_activations = pd.DataFrame(np.vstack(cls_attention_activations), columns=['CLS-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
            #sep_attention_activations = pd.DataFrame(np.vstack(sep_attention_activations), columns=['SEP-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        return [
            hidden_states_activations, attention_heads_activations,
            cls_hidden_states_activations, sep_hidden_states_activations,
            cls_attention_activations, sep_attention_activations
        ]
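The 'control-context-past' / 'control-context-future' settings intersect each row of the banded mask with a lower- or upper-triangular matrix, so a token's window is additionally restricted to the positions before (or after) each query position. A toy sketch of the broadcasting involved, with a hypothetical 1-D row mask:

import torch

dim = 5
row_mask = torch.tensor([1., 1., 1., 1., 0.])            # hypothetical mask row for one copy of the input
past_only = row_mask * torch.tril(torch.ones(dim, dim))  # keep column j only if row_mask[j] == 1
                                                          # and j <= the query position
print(past_only)
# tensor([[1., 0., 0., 0., 0.],
#         [1., 1., 0., 0., 0.],
#         [1., 1., 1., 0., 0.],
#         [1., 1., 1., 1., 0.],
#         [1., 1., 1., 1., 0.]])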
Example #6
    def get_classic_activations(self, iterator, language):
        """ Extract hidden state activations of the model for each token from the input, on a 
        word-by-word predictions or sentence-by-sentence prediction.
        Optionally includes surprisal and entropy.
        Input text should have one sentence per line, where each word and every 
        symbol is separated from the following by a space. No <eos> token should be included,
        as they are automatically integrated during tokenization.
        Arguments: 
            - iterator: iterator object, 
            generally: iterator = tokenize(path, language, self.vocab)
            - includ_surprisal: bool specifying if we include surprisal
            - includ_entropy: bool specifying if we include entropy
            - parameters: list (of string representing gate names)
        Returns:
            - result: pd.DataFrame containing activation (+ optionally entropy
            and surprisal)
        """
        hidden_states_activations = []
        attention_heads_activations = []
        cls_hidden_states_activations = []
        sep_hidden_states_activations = []
        cls_attention_activations = []
        sep_attention_activations = []
        # Here, the input is fed to the model one batch of lines at a time.
        batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
            iterator,
            self.config['number_of_sentence'],
            self.config['number_of_sentence_before'],
            self.config['number_of_sentence_after'],
            self.pretrained_bert_model,
            max_length=self.config['max_length'],
            stop_attention_before_sent=self.config['stop_attention_before_sent'],
            stop_attention_at_sent_before=self.config['stop_attention_at_sent_before'])

        indexes_tmp = []
        for i in range(len(indexes)):
            if type(indexes[i]) == list and type(indexes[i][0]) == list:
                indexes_tmp.append(indexes[i][-1])
            else:
                if i > 0:
                    indexes_tmp.append((
                        indexes[i][-self.config['number_of_sentence'] -
                                   self.config['number_of_sentence_after']][0],
                        indexes[i][-self.config['number_of_sentence'] -
                                   self.config['number_of_sentence_after']][1]
                    ))
                else:
                    indexes_tmp.append(None)

        if self.config['number_of_sentence_before'] == 0:
            indexes_tmp[0] = (indexes[0][0][0][0], indexes[0][-1][1])
        else:
            indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])

        for index, batch in enumerate(batches):
            batch = batch.strip()  # Remove trailing character

            batch = '[CLS] ' + batch + ' [SEP]'
            tokenized_text = self.tokenizer.wordpiece_tokenizer.tokenize(batch)
            inputs_ids = torch.tensor(
                [self.tokenizer.convert_tokens_to_ids(tokenized_text)])
            mapping = utils.match_tokenized_to_untokenized(
                tokenized_text, batch)

            if self.prediction_type == 'sentence':
                attention_mask = torch.tensor([[1 for x in tokenized_text]])

                if (self.config['stop_attention_at_sent_before']
                        is not None) and (index > 0) and not (
                            type(indexes[index]) == list
                            and type(indexes[index][0]) == list):
                    start_index = 1 if (index > self.config['number_of_sentence_before'] -
                                        self.config['stop_attention_at_sent_before'] -
                                        self.config['number_of_sentence']) else 0
                    # Start index of the sentence used as the attention cut-off point
                    sent_start = indexes[index][-self.config['stop_attention_at_sent_before'] -
                                                self.config['number_of_sentence'] -
                                                self.config['number_of_sentence_after']][0]
                    attention_mask[:, :start_index + sent_start] = 0
                    shift = self.config['stop_attention_before_sent']
                    if shift < 0:
                        attention_mask[:, start_index + sent_start:1 + sent_start - shift] = 0
                    elif shift > 0:
                        attention_mask[:, start_index + sent_start - shift:1 + sent_start] = 1
                elif (self.config['stop_attention_at_sent_before']
                      is not None) and index > 0:
                    start_index = 1 if (index > self.config['number_of_sentence_before'] -
                                        self.config['stop_attention_at_sent_before'] -
                                        self.config['number_of_sentence']) else 0
                    sent_start = indexes[index][0][-self.config['stop_attention_at_sent_before'] -
                                                   self.config['number_of_sentence'] -
                                                   self.config['number_of_sentence_after']][0]
                    attention_mask[:, :start_index + sent_start] = 0
                    shift = self.config['stop_attention_before_sent']
                    if shift < 0:
                        attention_mask[:, start_index + sent_start:1 + sent_start - shift] = 0
                    elif shift > 0:
                        attention_mask[:, start_index + sent_start - shift:1 + sent_start] = 1

            elif self.prediction_type == 'token-level':
                attention_mask = torch.diag_embed(
                    torch.tensor([0 for x in tokenized_text]))
                for i in range(
                        min(len(tokenized_text),
                            self.attention_length_before)):
                    attention_mask = torch.add(
                        attention_mask,
                        torch.diag_embed(torch.tensor(
                            [1 for x in range(len(tokenized_text) - i)]),
                                         offset=-i))
                for i in range(
                        1,
                        min(len(tokenized_text),
                            self.attention_length_after + 1)):
                    attention_mask = torch.add(
                        attention_mask,
                        torch.diag_embed(torch.tensor(
                            [1 for x in range(len(tokenized_text) - i)]),
                                         offset=i))

                attention_mask = attention_mask.unsqueeze(0)

            with torch.no_grad():
                encoded_layers = self.model(
                    inputs_ids, attention_mask=attention_mask
                )  # last_hidden_state, pooler_output, hidden_states, attentions
                # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
                # pooler_output dimension: (batch_size, hidden_size)
                # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
                # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
                # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
                #                                                       (batch_size, num_heads, sequence_length, sequence_length)]
                # filtration
                if self.model.config.output_hidden_states:
                    hidden_states_activations_ = np.vstack(
                        encoded_layers[2]
                    )  # retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count)
                    hidden_states_activations += utils.extract_activations_from_token_activations(
                        hidden_states_activations_, mapping,
                        indexes_tmp[index])
                    #cls_activations_, sep_activations_ = utils.extract_activations_from_special_tokens(hidden_states_activations_, mapping)
                    #cls_hidden_states_activations += cls_activations_
                    #sep_hidden_states_activations += sep_activations_
                if self.model.config.output_attentions:
                    raise NotImplementedError('Not yet implemented...')
                    #attention_heads_activations_ = np.vstack([array[0].view([
                    #                                            1,
                    #                                            inputs_ids.shape[-1],
                    #                                            self.NUM_ATTENTION_HEADS,
                    #                                            self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS]).permute(0, 2, 1, 3).contiguous()  for array in encoded_layers[3]])
                    #attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes_tmp[index])
                    ##cls_attention_, sep_attention_ = utils.extract_heads_activations_from_special_tokens(attention_heads_activations_, mapping)
                    ##cls_attention_activations += cls_attention_
                    ##sep_attention_activations += sep_attention_
        if self.model.config.output_hidden_states:
            hidden_states_activations = pd.DataFrame(
                np.vstack(hidden_states_activations),
                columns=[
                    'hidden_state-layer-{}-{}'.format(layer, index)
                    for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                    for index in range(1, 1 + self.FEATURE_COUNT)
                ])
            #cls_hidden_states_activations = pd.DataFrame(np.vstack(cls_hidden_states_activations), columns=['CLS-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
            #sep_hidden_states_activations = pd.DataFrame(np.vstack(sep_hidden_states_activations), columns=['SEP-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
        if self.model.config.output_attentions:
            raise NotImplementedError('Not yet implemented...')
            #attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
            ##cls_attention_activations = pd.DataFrame(np.vstack(cls_attention_activations), columns=['CLS-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
            ##sep_attention_activations = pd.DataFrame(np.vstack(sep_attention_activations), columns=['SEP-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        return [
            hidden_states_activations, attention_heads_activations,
            cls_hidden_states_activations, sep_hidden_states_activations,
            cls_attention_activations, sep_attention_activations
        ]
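All of these extractors assume that self.model was built with output_hidden_states (and optionally output_attentions) enabled, so that the forward pass exposes per-layer states as encoded_layers[2] / encoded_layers[3]. A minimal standalone sketch of that setup with the HuggingFace transformers API (model name and sentence are arbitrary, and the exact return type, tuple or ModelOutput, depends on the library version):

import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True,
                                  output_attentions=True)
model.eval()

tokens = tokenizer.tokenize('[CLS] the cat sat on the mat . [SEP]')
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])

with torch.no_grad():
    outputs = model(input_ids)

# Older transformers versions return a tuple:
# (last_hidden_state, pooler_output, hidden_states, attentions)
hidden_states = outputs[2] if isinstance(outputs, tuple) else outputs.hidden_states
print(len(hidden_states), hidden_states[0].shape)  # embeddings + one entry per layer, (1, n_tokens, hidden_size)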