def get_special_activations(self, iterator, language, transformation):
    """ Model predictions are generated batch by batch, applying a transformation to the
    input: the model takes the whole sentence as input. For each word, words outside its
    context window are either:
        - shuffled
        - replaced by random words
        - replaced by words with the same POS and dependency relations
    """
    hidden_states_activations = []
    attention_heads_activations = []
    # Here, a batch is just a single sentence, because the transformation prevents
    # building batches of equal length.
    batches, indexes = utils.batchify_sentences(
        iterator,
        self.config['number_of_sentence'],
        self.config['number_of_sentence_before'],
        self.pretrained_gpt2_model,
        past_context_size=self.config['attention_length_before'],
        transformation=transformation,
        vocabulary=self.config['tokens_vocabulary'],
        dictionary=self.config['pos_dictionary'],
        seed=self.config['seed'],
        max_length=self.config['max_length'])

    for index_batch, batch in enumerate(batches):
        batch = batch.strip()  # Remove leading/trailing whitespace
        batch = '<|endoftext|> ' + batch + ' <|endoftext|>'
        tokenized_text = self.tokenizer.tokenize(batch, add_prefix_space=False)
        #print('Batch number: ', index_batch, ' - ', batch)
        #print(tokenized_text)
        #print('indexes:', indexes[index_batch], tokenized_text[indexes[index_batch][0]:indexes[index_batch][1]])
        #print()
        inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])
        mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)

        with torch.no_grad():
            encoded_layers = self.model(inputs_ids)
            # last_hidden_state, pooler_output, hidden_states, attentions
            if self.model.config.output_hidden_states:
                # Retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count).
                hidden_states_activations_ = np.vstack(encoded_layers[2])
                hidden_states_activations += utils.extract_activations_from_token_activations_special(
                    hidden_states_activations_, mapping, indexes[index_batch])  # verify if we have to add 1 to indexes values
            if self.model.config.output_attentions:
                raise NotImplementedError('Not yet implemented...')

    if self.model.config.output_hidden_states:
        hidden_states_activations = pd.DataFrame(
            np.vstack(hidden_states_activations),
            columns=['hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
    if self.model.config.output_attentions:
        raise NotImplementedError('Not yet implemented...')

    return [hidden_states_activations, attention_heads_activations]
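# Illustrative sketch (not part of the extractor), assuming the Hugging Face GPT2Tokenizer:
# it shows how a batch is wrapped with '<|endoftext|>' and turned into input ids, mirroring
# the per-batch preprocessing in the loop above. The example sentence is arbitrary.
import torch
from transformers import GPT2Tokenizer

_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
_batch = '<|endoftext|> ' + 'The cat sat on the mat .' + ' <|endoftext|>'
_tokenized_text = _tokenizer.tokenize(_batch)
_inputs_ids = torch.tensor([_tokenizer.convert_tokens_to_ids(_tokenized_text)])
print(_inputs_ids.shape)  # (1, number_of_tokens)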
def get_token_level_activations(self, iterator, language):
    """ Model predictions are generated by batch with small attention masks: the model
    takes the whole sentence as input.
    """
    hidden_states_activations = []
    attention_heads_activations = []
    # Here, we feed the model one batch of lines at a time.
    batches, indexes = utils.batchify_with_detailed_indexes(
        iterator,
        self.config['number_of_sentence'],
        self.config['number_of_sentence_before'],
        self.pretrained_gpt2_model,
        max_length=self.config['max_length'],
        add_prefix_space=self.add_prefix_space)
    indexes_tmp = [(indexes[i][-self.config['number_of_sentence']][0], indexes[i][-1][1])
                   for i in range(len(indexes))]
    indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])
    # We add 1 because of the initial special token.
    for i in range(len(indexes_tmp)):
        indexes_tmp[i] = (indexes_tmp[i][0] + 1, indexes_tmp[i][1] + 1)

    for index_batch, batch in enumerate(batches):
        batch = batch.strip()  # Remove leading/trailing whitespace
        batch = '<|endoftext|> ' + batch + ' <|endoftext|>'
        tokenized_text = self.tokenizer.tokenize(batch, add_prefix_space=False)
        mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)
        beg = indexes_tmp[index_batch][0]
        end = indexes_tmp[index_batch][1]
        inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])
        inputs_ids = torch.cat(inputs_ids.size(1) * [inputs_ids])
        inputs_ids = inputs_ids[beg:end, :]
        attention_mask = torch.diag_embed(torch.tensor([0 for x in tokenized_text]))
        for i in range(min(len(tokenized_text), self.attention_length_before)):
            attention_mask = torch.add(
                attention_mask,
                torch.diag_embed(torch.tensor([1 for x in range(len(tokenized_text) - i)]),
                                 offset=-i))
        attention_mask = attention_mask[beg:end, :]

        with torch.no_grad():
            encoded_layers = self.model(inputs_ids, attention_mask=attention_mask)
            # last_hidden_state, pooler_output, hidden_states, attentions
            # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
            # pooler_output dimension: (batch_size, hidden_size)
            # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
            # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
            # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
            #                                                     (batch_size, num_heads, sequence_length, sequence_length)]
            # filtration
            if self.model.config.output_hidden_states:
                hidden_states_activations_ = np.vstack([
                    torch.cat([
                        encoded_layers[2][layer][i, len(tokenized_text) - encoded_layers[2][layer].size(0) + i - 1, :].unsqueeze(0)
                        for i in range(encoded_layers[2][layer].size(0))
                    ], dim=0).unsqueeze(0).detach().numpy()
                    for layer in range(len(encoded_layers[2]))
                ])
                # Pad with zeros so that the array covers all of tokenized_text
                # (dimension = layer_count * len(tokenized_text) * feature_count).
                hidden_states_activations_ = np.concatenate([
                    np.zeros((hidden_states_activations_.shape[0],
                              indexes_tmp[index_batch][0],
                              hidden_states_activations_.shape[-1])),
                    hidden_states_activations_,
                    np.zeros((hidden_states_activations_.shape[0],
                              len(tokenized_text) - indexes_tmp[index_batch][1],
                              hidden_states_activations_.shape[-1]))
                ], axis=1)
                hidden_states_activations += utils.extract_activations_from_token_activations(
                    hidden_states_activations_, mapping, indexes_tmp[index_batch])
            if self.model.config.output_attentions:
                raise NotImplementedError('Not yet implemented...')
                #attention_heads_activations_ = np.vstack([torch.cat([encoded_layers[-1][layer][0][i,:,i,:].unsqueeze(0) for i in range(len(tokenized_text))], dim=0).unsqueeze(0).detach().numpy() for layer in range(len(encoded_layers[-1]))])
                #attention_heads_activations_ = np.swapaxes(attention_heads_activations_, 1, 2)
                #attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes_tmp[index_batch])

    if self.model.config.output_hidden_states:
        hidden_states_activations = pd.DataFrame(
            np.vstack(hidden_states_activations),
            columns=['hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
    if self.model.config.output_attentions:
        raise NotImplementedError('Not yet implemented...')
        #attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])

    return [hidden_states_activations, attention_heads_activations]
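# Illustrative sketch (not part of the extractor): how the per-token attention mask used
# above is built with torch.diag_embed. Assuming a toy sequence of 5 tokens and
# attention_length_before = 3, the result is a banded lower-triangular matrix in which
# row i attends to tokens i-2 .. i only.
import torch

_tokens = ['tok'] * 5           # stand-in for tokenized_text
_attention_length_before = 3    # past-context window size

_attention_mask = torch.diag_embed(torch.tensor([0 for _ in _tokens]))
for _i in range(min(len(_tokens), _attention_length_before)):
    _attention_mask = torch.add(
        _attention_mask,
        torch.diag_embed(torch.tensor([1 for _ in range(len(_tokens) - _i)]), offset=-_i))
print(_attention_mask)
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [0, 1, 1, 1, 0],
#         [0, 0, 1, 1, 1]])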
def get_classic_activations(self, iterator, language):
    """ Extract the hidden-state activations of the model for each token of the input,
    using word-by-word or sentence-by-sentence predictions.
    Input text should have one sentence per line, where each word and each symbol is
    separated from the next by a space. No <eos> token should be included: special tokens
    are added automatically during tokenization.
    Arguments:
        - iterator: iterator object, generally: iterator = tokenize(path, language, self.vocab)
        - language: language of the input text
    Returns:
        - list of pd.DataFrame containing the hidden-state (and optionally attention-head) activations
    """
    hidden_states_activations = []
    attention_heads_activations = []
    # Here, we feed the model one batch of lines at a time.
    batches, indexes = utils.batchify_with_detailed_indexes(
        iterator,
        self.config['number_of_sentence'],
        self.config['number_of_sentence_before'],
        self.pretrained_gpt2_model,
        max_length=self.config['max_length'],
        stop_attention_at_sent=self.config['stop_attention_at_sent'],
        stop_attention_before_sent=self.config['stop_attention_before_sent'],
        add_prefix_space=self.add_prefix_space)
    indexes_tmp = [(indexes[i][-self.config['number_of_sentence']][0], indexes[i][-1][1])
                   for i in range(len(indexes))]
    indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])
    # We add 1 because of the initial special token.
    for i in range(len(indexes_tmp)):
        indexes_tmp[i] = (indexes_tmp[i][0] + 1, indexes_tmp[i][1] + 1)

    for index, batch in enumerate(batches):
        batch = batch.strip()  # Remove leading/trailing whitespace
        batch = '<|endoftext|> ' + batch + ' <|endoftext|>'
        tokenized_text = self.tokenizer.tokenize(batch, add_prefix_space=False)
        mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)
        inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])

        if self.prediction_type == 'sentence':
            attention_mask = torch.tensor([[1 for x in tokenized_text]])
            if (self.config['stop_attention_at_sent'] is not None) and (index > 0):
                # First token index of the sentence at which attention is cut off.
                stop = indexes[index][-self.config['stop_attention_at_sent']
                                      - self.config['number_of_sentence']][0]
                attention_mask[:, :stop] = 0
                if self.config['stop_attention_before_sent'] < 0:
                    attention_mask[:, 1 + stop:1 + stop - self.config['stop_attention_before_sent']] = 0
                elif self.config['stop_attention_before_sent'] > 0:
                    attention_mask[:, 1 + stop - self.config['stop_attention_before_sent']:1 + stop] = 1
        elif 'token-level' in self.prediction_type:
            attention_mask = torch.diag_embed(torch.tensor([0 for x in tokenized_text]))
            for i in range(min(len(tokenized_text), self.attention_length_before)):
                attention_mask = torch.add(
                    attention_mask,
                    torch.diag_embed(torch.tensor([1 for x in range(len(tokenized_text) - i)]),
                                     offset=-i))
            attention_mask = attention_mask.unsqueeze(0)
        if 'reverse' in self.prediction_type:
            attention_mask = 1 - attention_mask

        with torch.no_grad():
            encoded_layers = self.model(inputs_ids, attention_mask=attention_mask)
            # last_hidden_state, pooler_output, hidden_states, attentions
            # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
            # pooler_output dimension: (batch_size, hidden_size)
            # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
            # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
            # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
            #                                                     (batch_size, num_heads, sequence_length, sequence_length)]
            # filtration
            if self.model.config.output_hidden_states:
                # Retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count).
                hidden_states_activations_ = np.vstack(encoded_layers[2])
                hidden_states_activations += utils.extract_activations_from_token_activations(
                    hidden_states_activations_, mapping, indexes_tmp[index])
            if self.model.config.output_attentions:
                attention_heads_activations_ = np.vstack([array[0] for array in encoded_layers[3]])
                attention_heads_activations += utils.extract_heads_activations_from_token_activations(
                    attention_heads_activations_, mapping, indexes_tmp[index])

    if self.model.config.output_hidden_states:
        hidden_states_activations = pd.DataFrame(
            np.vstack(hidden_states_activations),
            columns=['hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
    if self.model.config.output_attentions:
        attention_heads_activations = pd.DataFrame(
            np.vstack(attention_heads_activations),
            columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index)
                     for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS)
                     for head in range(1, 1 + self.NUM_ATTENTION_HEADS)
                     for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])

    return [hidden_states_activations, attention_heads_activations]
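# Illustrative sketch (not part of the extractor): the 'reverse' prediction type above simply
# inverts the token-level mask, so each token attends to everything *outside* its past-context
# window. Toy example reusing the 5-token banded mask from the earlier sketch:
import torch

_mask = torch.tensor([[1, 0, 0, 0, 0],
                      [1, 1, 0, 0, 0],
                      [1, 1, 1, 0, 0],
                      [0, 1, 1, 1, 0],
                      [0, 0, 1, 1, 1]])
_reversed_mask = 1 - _mask.unsqueeze(0)   # unsqueeze(0) adds the batch dimension
print(_reversed_mask.shape)               # torch.Size([1, 5, 5])
print(_reversed_mask[0, 3])               # tensor([1, 0, 0, 0, 1])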
def get_classic_activations(self, iterator, language):
    """ Model predictions are generated in the classical way: the model takes the whole
    sentence as input.
    """
    hidden_states_activations = []
    attention_heads_activations = []
    cls_hidden_states_activations = []
    sep_hidden_states_activations = []
    cls_attention_activations = []
    sep_attention_activations = []
    # Here, we feed the model one batch of lines at a time.
    batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
        iterator,
        self.tokenizer,
        self.config['number_of_sentence'],
        self.config['number_of_sentence_before'],
        self.config['number_of_sentence_after'],
        max_length=self.config['max_length'])

    for index, batch in enumerate(batches):
        batch = batch.strip()  # Remove leading/trailing whitespace
        batch = '[CLS] ' + batch + ' [SEP]'
        tokenized_text = self.tokenizer.wordpiece_tokenizer.tokenize(batch)
        inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])
        attention_mask = torch.tensor([[1 for x in tokenized_text]])
        mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)

        with torch.no_grad():
            encoded_layers = self.model(inputs_ids, attention_mask=attention_mask)
            # last_hidden_state, pooler_output, hidden_states, attentions
            # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
            # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
            # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
            # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
            #                                                     (batch_size, num_heads, sequence_length, sequence_length)]
            # filtration
            if self.model.config.output_hidden_states:
                # hidden_states is the third element of the output tuple
                # (dimension = layer_count * len(tokenized_text) * feature_count).
                hidden_states_activations_ = np.vstack(encoded_layers[2])
                hidden_states_activations += utils.extract_activations_from_token_activations(
                    hidden_states_activations_, mapping, indexes[index])
                cls_activations_, sep_activations_ = utils.extract_activations_from_special_tokens(
                    hidden_states_activations_, mapping)
                cls_hidden_states_activations += cls_activations_
                sep_hidden_states_activations += sep_activations_
            if self.model.config.output_attentions:
                attention_heads_activations_ = np.vstack(
                    [array[0].contiguous().numpy() for array in encoded_layers[3]])
                attention_heads_activations += utils.extract_heads_activations_from_token_activations(
                    attention_heads_activations_, mapping, indexes[index])
                cls_attention_, sep_attention_ = utils.extract_heads_activations_from_special_tokens(
                    attention_heads_activations_, mapping)
                cls_attention_activations += cls_attention_
                sep_attention_activations += sep_attention_

    if self.model.config.output_hidden_states:
        hidden_states_activations = pd.DataFrame(
            np.vstack(hidden_states_activations),
            columns=['hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
        cls_hidden_states_activations = pd.DataFrame(
            np.vstack(cls_hidden_states_activations),
            columns=['CLS-hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
        sep_hidden_states_activations = pd.DataFrame(
            np.vstack(sep_hidden_states_activations),
            columns=['SEP-hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
    if self.model.config.output_attentions:
        attention_heads_activations = pd.DataFrame(
            np.vstack(attention_heads_activations),
            columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index)
                     for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS)
                     for head in range(1, 1 + self.NUM_ATTENTION_HEADS)
                     for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        cls_attention_activations = pd.DataFrame(
            np.vstack(cls_attention_activations),
            columns=['CLS-attention-layer-{}-head-{}-{}'.format(layer, head, index)
                     for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS)
                     for head in range(1, 1 + self.NUM_ATTENTION_HEADS)
                     for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        sep_attention_activations = pd.DataFrame(
            np.vstack(sep_attention_activations),
            columns=['SEP-attention-layer-{}-head-{}-{}'.format(layer, head, index)
                     for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS)
                     for head in range(1, 1 + self.NUM_ATTENTION_HEADS)
                     for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])

    return [hidden_states_activations, attention_heads_activations,
            cls_hidden_states_activations, sep_hidden_states_activations,
            cls_attention_activations, sep_attention_activations]
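# Illustrative sketch (not part of the extractor), assuming the Hugging Face (slow) BertTokenizer,
# which exposes a `wordpiece_tokenizer` attribute: the batch is wrapped with [CLS]/[SEP] before
# word-piece tokenization, as in the loop above. 'bert-base-cased' is only an example checkpoint
# and the sentence is arbitrary.
import torch
from transformers import BertTokenizer

_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
_batch = '[CLS] ' + 'The cat sat on the mat .' + ' [SEP]'
_tokenized_text = _tokenizer.wordpiece_tokenizer.tokenize(_batch)
_inputs_ids = torch.tensor([_tokenizer.convert_tokens_to_ids(_tokenized_text)])
_attention_mask = torch.tensor([[1 for _ in _tokenized_text]])  # full attention over the sequence
print(_tokenized_text[:3], _inputs_ids.shape, _attention_mask.shape)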
def get_token_level_activations(self, iterator, language):
    """ Model predictions are generated by batch with small attention masks: the model
    takes the whole sentence as input.
    """
    hidden_states_activations = []
    attention_heads_activations = []
    cls_hidden_states_activations = []
    sep_hidden_states_activations = []
    cls_attention_activations = []
    sep_attention_activations = []
    # Here, we feed the model one batch of lines at a time.
    batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
        iterator,
        self.config['number_of_sentence'],
        self.config['number_of_sentence_before'],
        self.config['number_of_sentence_after'],
        self.pretrained_bert_model,
        max_length=self.config['max_length'])

    indexes_tmp = []
    # If beginning and end indexes of each sentence are recorded, we only keep the sentence(s) of interest.
    for i in range(len(indexes)):
        if type(indexes[i]) == list and type(indexes[i][0]) == list:
            indexes_tmp.append(indexes[i][-1])
        else:
            if i > 0:
                indexes_tmp.append((
                    indexes[i][-self.config['number_of_sentence'] - self.config['number_of_sentence_after']][0],
                    indexes[i][-self.config['number_of_sentence'] - self.config['number_of_sentence_after']][1]))
            else:
                indexes_tmp.append(None)
    if self.config['number_of_sentence_before'] == 0:
        indexes_tmp[0] = (indexes[0][0][0][0], indexes[0][-1][1])
    else:
        indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])

    for index_batch, batch in enumerate(batches):
        batch = batch.strip()  # Remove leading/trailing whitespace
        batch = '[CLS] ' + batch + ' [SEP]'
        tokenized_text = self.tokenizer.wordpiece_tokenizer.tokenize(batch)
        inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])
        inputs_ids = torch.cat(inputs_ids.size(1) * [inputs_ids])
        attention_mask = torch.diag_embed(torch.tensor([[0 for x in tokenized_text]]))
        for i in range(min(len(tokenized_text), self.attention_length_before)):
            attention_mask = torch.add(
                attention_mask,
                torch.diag_embed(torch.tensor([[1 for x in range(len(tokenized_text) - i)]]),
                                 offset=-i))
        for i in range(1, min(len(tokenized_text), self.attention_length_after + 1)):
            attention_mask = torch.add(
                attention_mask,
                torch.diag_embed(torch.tensor([[1 for x in range(len(tokenized_text) - i)]]),
                                 offset=i))
        mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)
        attention_mask = attention_mask.squeeze(0)

        beg = indexes_tmp[index_batch][0] + 1  # because of the special token at the beginning
        end = indexes_tmp[index_batch][1] + 1  # because of the special token
        inputs_ids = inputs_ids[beg:end, :]
        attention_mask = attention_mask[beg:end, :]
        dim = inputs_ids.size(1)
        if self.prediction_type == 'control-context-past':
            attention_mask = torch.stack([
                attention_mask[index, :] * torch.tril(torch.ones(dim, dim))
                for index in range(attention_mask.size(0))])
        elif self.prediction_type == 'control-context-future':
            attention_mask = torch.stack([
                attention_mask[index, :] * torch.triu(torch.ones(dim, dim))
                for index in range(attention_mask.size(0))])

        with torch.no_grad():
            encoded_layers = self.model(inputs_ids, attention_mask=attention_mask)
            # last_hidden_state, pooler_output, hidden_states, attentions
            # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
            # pooler_output dimension: (batch_size, hidden_size)
            # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
            # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
            # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
            #                                                     (batch_size, num_heads, sequence_length, sequence_length)]
            # filtration
            if self.model.config.output_hidden_states:
                # Retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count).
                hidden_states_activations_ = np.vstack([
                    torch.cat([
                        encoded_layers[2][layer][i, len(tokenized_text) - encoded_layers[2][layer].size(0) + i - 1, :].unsqueeze(0)
                        for i in range(encoded_layers[2][layer].size(0))
                    ], dim=0).unsqueeze(0).detach().numpy()
                    for layer in range(len(encoded_layers[2]))
                ])
                # Pad with zeros so that the array covers all of tokenized_text.
                hidden_states_activations_ = np.concatenate([
                    np.zeros((hidden_states_activations_.shape[0],
                              indexes_tmp[index_batch][0] + 1,
                              hidden_states_activations_.shape[-1])),
                    hidden_states_activations_,
                    np.zeros((hidden_states_activations_.shape[0],
                              len(tokenized_text) - indexes_tmp[index_batch][1] - 1,
                              hidden_states_activations_.shape[-1]))
                ], axis=1)
                hidden_states_activations += utils.extract_activations_from_token_activations(
                    hidden_states_activations_, mapping, indexes_tmp[index_batch])
                #cls_activations_, sep_activations_ = utils.extract_activations_from_special_tokens(hidden_states_activations_, mapping)
                #cls_hidden_states_activations += cls_activations_
                #sep_hidden_states_activations += sep_activations_
            if self.model.config.output_attentions:
                raise NotImplementedError('Not yet implemented...')
                #attention_heads_activations_ = np.vstack([torch.cat([encoded_layers[-1][layer][0][i,len(tokenized_text) - encoded_layers[-1][layer][0].size(0) + i,:].unsqueeze(0) for i in range(encoded_layers[-1][layer][0].size(0))], dim=0).unsqueeze(0).detach().numpy() for layer in range(len(encoded_layers[-1]))])
                #if indexes_tmp[index_batch][0] > 0:
                #    attention_heads_activations_ = np.concatenate([np.zeros((attention_heads_activations_.shape[0], indexes_tmp[index_batch][0], attention_heads_activations_.shape[-1])), attention_heads_activations_], axis=1)
                #attention_heads_activations_ = attention_heads_activations_.reshape([
                #    self.NUM_HIDDEN_LAYERS,
                #    len(tokenized_text),
                #    self.NUM_ATTENTION_HEADS,
                #    self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS])
                #attention_heads_activations_ = np.swapaxes(attention_heads_activations_, 1, 2)
                #attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes_tmp[index_batch])
                #cls_attention_, sep_attention_ = utils.extract_heads_activations_from_special_tokens(attention_heads_activations_, mapping)
                #cls_attention_activations += cls_attention_
                #sep_attention_activations += sep_attention_

    if self.model.config.output_hidden_states:
        hidden_states_activations = pd.DataFrame(
            np.vstack(hidden_states_activations),
            columns=['hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
        #cls_hidden_states_activations = pd.DataFrame(np.vstack(cls_hidden_states_activations), columns=['CLS-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
        #sep_hidden_states_activations = pd.DataFrame(np.vstack(sep_hidden_states_activations), columns=['SEP-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
    if self.model.config.output_attentions:
        raise NotImplementedError('Not yet implemented...')
        #attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        #cls_attention_activations = pd.DataFrame(np.vstack(cls_attention_activations), columns=['CLS-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        #sep_attention_activations = pd.DataFrame(np.vstack(sep_attention_activations), columns=['SEP-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])

    return [hidden_states_activations, attention_heads_activations,
            cls_hidden_states_activations, sep_hidden_states_activations,
            cls_attention_activations, sep_attention_activations]
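# Illustrative sketch (not part of the extractor): the token-level BERT mask above adds both a
# past window and a future window around each token; the 'control-context-past' /
# 'control-context-future' variants then keep only the lower or upper triangular part.
# Toy example: 5 tokens, attention_length_before = 2, attention_length_after = 1.
import torch

_n = 5
_attention_length_before, _attention_length_after = 2, 1
_mask = torch.diag_embed(torch.tensor([[0] * _n]))
for _i in range(min(_n, _attention_length_before)):
    _mask = torch.add(_mask, torch.diag_embed(torch.tensor([[1] * (_n - _i)]), offset=-_i))
for _i in range(1, min(_n, _attention_length_after + 1)):
    _mask = torch.add(_mask, torch.diag_embed(torch.tensor([[1] * (_n - _i)]), offset=_i))
_mask = _mask.squeeze(0)
print(_mask)
# tensor([[1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [0, 1, 1, 1, 0],
#         [0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1]])
_past_only = _mask * torch.tril(torch.ones(_n, _n)).long()   # 'control-context-past' variant
print(_past_only[2])                                         # tensor([0, 1, 1, 0, 0])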
def get_classic_activations(self, iterator, language):
    """ Extract the hidden-state activations of the model for each token of the input,
    using word-by-word or sentence-by-sentence predictions.
    Input text should have one sentence per line, where each word and each symbol is
    separated from the next by a space. No <eos> token should be included: special tokens
    are added automatically during tokenization.
    Arguments:
        - iterator: iterator object, generally: iterator = tokenize(path, language, self.vocab)
        - language: language of the input text
    Returns:
        - list of pd.DataFrame containing the hidden-state (and optionally attention-head) activations
    """
    hidden_states_activations = []
    attention_heads_activations = []
    cls_hidden_states_activations = []
    sep_hidden_states_activations = []
    cls_attention_activations = []
    sep_attention_activations = []
    # Here, we feed the model one batch of lines at a time.
    batches, indexes = utils.batchify_per_sentence_with_pre_and_post_context(
        iterator,
        self.config['number_of_sentence'],
        self.config['number_of_sentence_before'],
        self.config['number_of_sentence_after'],
        self.pretrained_bert_model,
        max_length=self.config['max_length'],
        stop_attention_before_sent=self.config['stop_attention_before_sent'],
        stop_attention_at_sent_before=self.config['stop_attention_at_sent_before'])

    indexes_tmp = []
    # If beginning and end indexes of each sentence are recorded, we only keep the sentence(s) of interest.
    for i in range(len(indexes)):
        if type(indexes[i]) == list and type(indexes[i][0]) == list:
            indexes_tmp.append(indexes[i][-1])
        else:
            if i > 0:
                indexes_tmp.append((
                    indexes[i][-self.config['number_of_sentence'] - self.config['number_of_sentence_after']][0],
                    indexes[i][-self.config['number_of_sentence'] - self.config['number_of_sentence_after']][1]))
            else:
                indexes_tmp.append(None)
    if self.config['number_of_sentence_before'] == 0:
        indexes_tmp[0] = (indexes[0][0][0][0], indexes[0][-1][1])
    else:
        indexes_tmp[0] = (indexes[0][0][0], indexes[0][-1][1])

    for index, batch in enumerate(batches):
        batch = batch.strip()  # Remove leading/trailing whitespace
        batch = '[CLS] ' + batch + ' [SEP]'
        tokenized_text = self.tokenizer.wordpiece_tokenizer.tokenize(batch)
        inputs_ids = torch.tensor([self.tokenizer.convert_tokens_to_ids(tokenized_text)])
        mapping = utils.match_tokenized_to_untokenized(tokenized_text, batch)

        if self.prediction_type == 'sentence':
            attention_mask = torch.tensor([[1 for x in tokenized_text]])
            if (self.config['stop_attention_at_sent_before'] is not None) and (index > 0) and not (
                    type(indexes[index]) == list and type(indexes[index][0]) == list):
                start_index = 1 if (index > self.config['number_of_sentence_before']
                                    - self.config['stop_attention_at_sent_before']
                                    - self.config['number_of_sentence']) else 0
                # First token index of the sentence at which attention is cut off.
                stop = indexes[index][-self.config['stop_attention_at_sent_before']
                                      - self.config['number_of_sentence']
                                      - self.config['number_of_sentence_after']][0]
                attention_mask[:, :start_index + stop] = 0
                if self.config['stop_attention_before_sent'] < 0:
                    attention_mask[:, start_index + stop:1 + stop - self.config['stop_attention_before_sent']] = 0
                elif self.config['stop_attention_before_sent'] > 0:
                    attention_mask[:, start_index + stop - self.config['stop_attention_before_sent']:1 + stop] = 1
            elif (self.config['stop_attention_at_sent_before'] is not None) and index > 0:
                start_index = 1 if (index > self.config['number_of_sentence_before']
                                    - self.config['stop_attention_at_sent_before']
                                    - self.config['number_of_sentence']) else 0
                # Same cut-off, but indexes are nested one level deeper for this batch.
                stop = indexes[index][0][-self.config['stop_attention_at_sent_before']
                                         - self.config['number_of_sentence']
                                         - self.config['number_of_sentence_after']][0]
                attention_mask[:, :start_index + stop] = 0
                if self.config['stop_attention_before_sent'] < 0:
                    attention_mask[:, start_index + stop:1 + stop - self.config['stop_attention_before_sent']] = 0
                elif self.config['stop_attention_before_sent'] > 0:
                    attention_mask[:, start_index + stop - self.config['stop_attention_before_sent']:1 + stop] = 1
        elif self.prediction_type == 'token-level':
            attention_mask = torch.diag_embed(torch.tensor([0 for x in tokenized_text]))
            for i in range(min(len(tokenized_text), self.attention_length_before)):
                attention_mask = torch.add(
                    attention_mask,
                    torch.diag_embed(torch.tensor([1 for x in range(len(tokenized_text) - i)]),
                                     offset=-i))
            for i in range(1, min(len(tokenized_text), self.attention_length_after + 1)):
                attention_mask = torch.add(
                    attention_mask,
                    torch.diag_embed(torch.tensor([1 for x in range(len(tokenized_text) - i)]),
                                     offset=i))
            attention_mask = attention_mask.unsqueeze(0)

        with torch.no_grad():
            encoded_layers = self.model(inputs_ids, attention_mask=attention_mask)
            # last_hidden_state, pooler_output, hidden_states, attentions
            # last_hidden_state dimension: (batch_size, sequence_length, hidden_size)
            # pooler_output dimension: (batch_size, hidden_size)
            # hidden_states dimension: num_layers * (batch_size, sequence_length, hidden_size)
            # attentions dimension: num_layers * (batch_size, num_heads, sequence_length, sequence_length)
            # hacked version: attentions dimension: num_layers * [(batch_size, sequence_length, hidden_size),
            #                                                     (batch_size, num_heads, sequence_length, sequence_length)]
            # filtration
            if self.model.config.output_hidden_states:
                # Retrieve all the hidden states (dimension = layer_count * len(tokenized_text) * feature_count).
                hidden_states_activations_ = np.vstack(encoded_layers[2])
                hidden_states_activations += utils.extract_activations_from_token_activations(
                    hidden_states_activations_, mapping, indexes_tmp[index])
                #cls_activations_, sep_activations_ = utils.extract_activations_from_special_tokens(hidden_states_activations_, mapping)
                #cls_hidden_states_activations += cls_activations_
                #sep_hidden_states_activations += sep_activations_
            if self.model.config.output_attentions:
                raise NotImplementedError('Not yet implemented...')
                #attention_heads_activations_ = np.vstack([array[0].view([
                #    1,
                #    inputs_ids.shape[-1],
                #    self.NUM_ATTENTION_HEADS,
                #    self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS]).permute(0, 2, 1, 3).contiguous() for array in encoded_layers[3]])
                #attention_heads_activations += utils.extract_heads_activations_from_token_activations(attention_heads_activations_, mapping, indexes_tmp[index])
                ##cls_attention_, sep_attention_ = utils.extract_heads_activations_from_special_tokens(attention_heads_activations_, mapping)
                ##cls_attention_activations += cls_attention_
                ##sep_attention_activations += sep_attention_

    if self.model.config.output_hidden_states:
        hidden_states_activations = pd.DataFrame(
            np.vstack(hidden_states_activations),
            columns=['hidden_state-layer-{}-{}'.format(layer, index)
                     for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS)
                     for index in range(1, 1 + self.FEATURE_COUNT)])
        #cls_hidden_states_activations = pd.DataFrame(np.vstack(cls_hidden_states_activations), columns=['CLS-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
        #sep_hidden_states_activations = pd.DataFrame(np.vstack(sep_hidden_states_activations), columns=['SEP-hidden_state-layer-{}-{}'.format(layer, index) for layer in np.arange(1 + self.NUM_HIDDEN_LAYERS) for index in range(1, 1 + self.FEATURE_COUNT)])
    if self.model.config.output_attentions:
        raise NotImplementedError('Not yet implemented...')
        #attention_heads_activations = pd.DataFrame(np.vstack(attention_heads_activations), columns=['attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        ##cls_attention_activations = pd.DataFrame(np.vstack(cls_attention_activations), columns=['CLS-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])
        ##sep_attention_activations = pd.DataFrame(np.vstack(sep_attention_activations), columns=['SEP-attention-layer-{}-head-{}-{}'.format(layer, head, index) for layer in np.arange(1, 1 + self.NUM_HIDDEN_LAYERS) for head in range(1, 1 + self.NUM_ATTENTION_HEADS) for index in range(1, 1 + self.FEATURE_COUNT // self.NUM_ATTENTION_HEADS)])

    return [hidden_states_activations, attention_heads_activations,
            cls_hidden_states_activations, sep_hidden_states_activations,
            cls_attention_activations, sep_attention_activations]
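# Illustrative sketch (not part of the extractor): a hypothetical helper showing how a caller
# could slice the hidden-state DataFrame returned above, relying only on the
# 'hidden_state-layer-{layer}-{feature}' column-naming scheme used by these methods.
import pandas as pd

def select_layer(activations: pd.DataFrame, layer: int) -> pd.DataFrame:
    # Keep only the columns belonging to the requested layer.
    prefix = 'hidden_state-layer-{}-'.format(layer)
    return activations[[c for c in activations.columns if c.startswith(prefix)]]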