def validate(model: GTransformer, iterator,
             text_encoder: WhitespaceEncoder) -> None:
    """
    Computes the loss over the validation set.

    :param model: Sequence-to-sequence transformer model.
    :param iterator: Iterator object over the test Dataset.
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    """
    total_loss, steps = 0, 0
    # Testing
    with torch.no_grad():
        model.train(False)
        for sample in iterator:
            # 1) Prepare Sample
            src, src_lengths, trg, shifted_trg, trg_lengths = prepare_sample(
                sample, text_encoder)

            # 2) Run model
            lprobs = model(
                src=src.cuda(),
                trg=shifted_trg.cuda(),
                src_mask=lengths_to_mask(src_lengths).unsqueeze(1).cuda(),
                trg_mask=lengths_to_mask(trg_lengths).unsqueeze(1).cuda())

            # 3) Compute loss
            loss = F.nll_loss(lprobs.transpose(2, 1), trg.cuda(), reduction='mean')

            # 4) Update validation metrics
            total_loss += float(loss.item())
            steps += int(trg.ne(PAD_IDX).sum())

    print(f'-- total test loss {total_loss:.4}')
    print(f'-- test steps {steps}')
def train_loop(configs: dict, model: GTransformer, opt: torch.optim.Adam,
               train: Dataset, test: Dataset,
               text_encoder: WhitespaceEncoder) -> GTransformer:
    """
    Main training loop.

    :param configs: Configs defined in the default.yaml file.
    :param model: Sequence-to-sequence transformer.
    :param opt: Adam optimizer.
    :param train: The dataset used for training.
    :param test: The dataset used for validation.
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    """
    for e in range(configs.get('num_epochs', 8)):
        print(f'\n Epoch {e}')
        model.train()
        nr_batches = math.ceil(len(train) / configs.get('batch_size', 8))
        train_iter, test_iter = get_iterators(configs, train, test)

        total_loss, steps = 0, 0
        for sample in tqdm.tqdm(train_iter, total=nr_batches):
            # 0) Zero out previous grads
            opt.zero_grad()

            # 1) Prepare Sample
            src, src_lengths, trg, shifted_trg, trg_lengths = prepare_sample(
                sample, text_encoder)

            # 2) Run model
            lprobs = model(
                src=src.cuda(),
                trg=shifted_trg.cuda(),
                src_mask=lengths_to_mask(src_lengths).unsqueeze(1).cuda(),
                trg_mask=lengths_to_mask(trg_lengths).unsqueeze(1).cuda())

            # 3) Compute loss
            loss = F.nll_loss(lprobs.transpose(2, 1), trg.cuda(), reduction='mean')
            loss.backward()

            # 4) Update training metrics
            total_loss += float(loss.item())
            steps += int(trg.ne(0).sum())

            # 5) Clip gradients
            # If the norm of the gradient vector exceeds the configured threshold,
            # rescale it back down to that threshold.
            if configs.get('gradient_clipping', -1) > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(),
                                         configs.get('gradient_clipping'))

            # 6) Optim step
            opt.step()

        print(f'-- total train loss {total_loss:.4}')
        total_steps = steps * (e + 1)
        print(f'-- train steps {total_steps}')
        validate(model, test_iter, text_encoder)
    return model
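# NOTE: the `prepare_sample` helper used by validate/train_loop above is not shown in
# this excerpt. The sketch below is an assumption of what it might look like for this
# seq2seq setup, not the project's actual helper: the BOS_IDX constant and the way
# `shifted_trg` is built (the target shifted right behind a BOS token for teacher
# forcing) are hypothetical.
def prepare_sample_sketch(sample: dict, text_encoder: WhitespaceEncoder):
    sample = collate_tensors(sample)
    src, src_lengths = text_encoder.batch_encode(sample['source'])
    trg, trg_lengths = text_encoder.batch_encode(sample['target'])
    # Decoder input for teacher forcing: the target shifted one position to the right.
    bos = torch.full((trg.size(0), 1), BOS_IDX, dtype=trg.dtype)
    shifted_trg = torch.cat([bos, trg[:, :-1]], dim=1)
    return src, src_lengths, trg, shifted_trg, trg_lengths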
def forward(self, tokens, lengths):
    """
    Usual pytorch forward function.

    :param tokens: text sequences [batch_size x src_seq_len]
    :param lengths: source lengths [batch_size]

    Returns:
        Dictionary with model outputs (e.g: logits)
    """
    tokens = tokens[:, :lengths.max()]
    # When using just one GPU this should not change behavior,
    # but when splitting batches across GPUs the tokens keep the padding
    # of the entire original batch.
    mask = lengths_to_mask(lengths, device=tokens.device)

    # Run BERT model.
    word_embeddings = self.transformer(tokens, mask)[0]

    # Average Pooling
    word_embeddings = mask_fill(0.0, tokens, word_embeddings,
                                self.tokenizer.padding_index)
    sentemb = torch.sum(word_embeddings, 1)
    sum_mask = mask.unsqueeze(-1).expand(word_embeddings.size()).float().sum(1)
    sentemb = sentemb / sum_mask

    return {"logits": self.classification_head(sentemb)}
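# `mask_fill` is an external helper not shown in this excerpt. A possible implementation
# consistent with the call above (overwrite the embeddings of padding positions with a
# constant before average pooling) could look like this sketch; it is an assumption,
# not necessarily the repository's version.
def mask_fill_sketch(fill_value: float,
                     tokens: torch.Tensor,
                     embeddings: torch.Tensor,
                     padding_index: int) -> torch.Tensor:
    """Fills the embeddings of padded tokens with `fill_value`."""
    padding_mask = tokens.eq(padding_index).unsqueeze(-1)
    return embeddings.float().masked_fill_(padding_mask, fill_value).type_as(embeddings)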
def forward(self, tokens: torch.Tensor, lengths: torch.Tensor, **kwargs) -> dict:
    """
    Encodes a batch of sequences.

    :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
    :param lengths: Torch tensor with the length of each sequence [batch_size].

    Returns:
        - 'sentemb': tensor [batch_size x 1024] with the sentence encoding.
        - 'wordemb': tensor [batch_size x seq_len x 1024] with the word level embeddings.
        - 'all_layers': List with the word_embeddings returned by each layer.
        - 'mask': torch.Tensor [batch_size x seq_len]
        - 'extra': tuple with the last_hidden_state [batch_size x seq_len x hidden_size],
            the pooler_output representing the entire sentence, and the word embeddings
            for all XLM-R layers (list of tensors [batch_size x seq_len x hidden_size]).
    """
    mask = lengths_to_mask(lengths, device=tokens.device)
    # Run RoBERTa model.
    last_hidden_states, pooler_output, all_layers = self.model(tokens, mask)
    return {
        "sentemb": pooler_output,
        "wordemb": last_hidden_states,
        "all_layers": all_layers,
        "mask": mask,
        "extra": (last_hidden_states, pooler_output, all_layers),
    }
def forward(self, tokens: torch.Tensor, lengths: torch.Tensor, **kwargs) -> dict:
    """
    Encodes a batch of sequences.

    :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
    :param lengths: Torch tensor with the length of each sequence [batch_size].

    Returns:
        - 'sentemb': tensor [batch_size x 1024] with the sentence encoding.
        - 'wordemb': tensor [batch_size x seq_len x 1024] with the word level embeddings.
        - 'all_layers': List with the word_embeddings returned by each layer.
        - 'mask': torch.Tensor [batch_size x seq_len]
        - 'extra': list with all XLM-R layers (tensors [batch_size x seq_len x hidden_size]).
    """
    mask = lengths_to_mask(lengths, device=tokens.device)
    # Run RoBERTa model.
    all_layers = self.model.extract_features(tokens, return_all_hiddens=True)
    return {
        "sentemb": all_layers[-1][:, 0, :],
        "wordemb": all_layers[-1],
        "all_layers": all_layers,
        "mask": mask,
        "extra": all_layers,
    }
def test_lengths_to_mask():
    assert lengths_to_mask([3]).sum() == 3
    assert lengths_to_mask(torch.tensor(3)).sum() == 3
    assert lengths_to_mask([1, 2, 3]).sum() == 6
    assert lengths_to_mask([1, 2, 3])[0].sum() == 1
    assert lengths_to_mask([1, 2, 3])[0][0].item() == 1
    assert lengths_to_mask(torch.tensor([1, 2, 3]))[0][0].item() == 1
    assert lengths_to_mask(torch.tensor([1.0, 2.0, 3.0]))[0][0].item() == 1
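# For reference, a minimal `lengths_to_mask` consistent with the asserts above: it
# accepts a Python list or a 0-d/1-d tensor of lengths and returns a boolean
# [batch_size x max_len] mask. This is only a sketch, not necessarily the helper
# imported by the snippets in this excerpt.
def lengths_to_mask_sketch(lengths, device=None) -> torch.Tensor:
    if not torch.is_tensor(lengths):
        lengths = torch.tensor(lengths, device=device)
    lengths = lengths.view(-1).long()
    max_len = int(lengths.max())
    # Positions smaller than the sequence length are kept (True), the rest are padding.
    positions = torch.arange(max_len, device=lengths.device)
    return positions.unsqueeze(0) < lengths.unsqueeze(1)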
def forward(
    self,
    input_ids: torch.Tensor,
    input_lengths: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    # Reduce unnecessary padding.
    input_ids = input_ids[:, : input_lengths.max()]
    mask = lengths_to_mask(input_lengths, device=input_ids.device)

    # Run model.
    word_embeddings = self.transformer(input_ids, mask)[0]

    # Pooling Layer
    sentemb = self.apply_pooling(input_ids, word_embeddings, mask)

    # Classify
    return self.classification_head(sentemb)
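# `apply_pooling` belongs to the surrounding class and is not shown here. A hypothetical
# average-pooling variant, matching the masked mean used in the BERT classifier above,
# could look like this sketch:
def apply_pooling_sketch(input_ids: torch.Tensor,
                         word_embeddings: torch.Tensor,
                         mask: torch.Tensor) -> torch.Tensor:
    """Masked mean over the sequence dimension -> [batch_size x hidden_size]."""
    # input_ids is unused by this mean-pooling variant; it is kept only to match the
    # call signature above (other pooling strategies may need it).
    mask = mask.unsqueeze(-1).float()
    summed = (word_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts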
def _build_seq_eos_mask(self, tokens: torch.Tensor, eos_id=3, curr_pos_in_seq=0):
    """Masks the tokens that come after the EOS token."""
    current_max_seq_len = tokens.size(1)
    lengths = []
    for seq in tokens:
        eos_indexes = torch.nonzero(seq == eos_id)
        if eos_indexes.size(0) == 0:
            # No EOS in this sequence: keep it fully unmasked.
            lengths.append(current_max_seq_len)
        else:
            # Sequence length up to the first EOS token.
            current_len = int(eos_indexes[0, 0])
            lengths.append(current_len)

    assert len(lengths) == tokens.size(0)
    mask: torch.Tensor = lengths_to_mask(lengths, device=tokens.device)
    return mask
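# Toy illustration of _build_seq_eos_mask (a sketch; `encoder` stands for an instance
# of the surrounding class, and eos_id=3 as in the default above):
#   tokens = torch.tensor([[5, 7, 3, 9],
#                          [5, 7, 9, 8]])
#   mask = encoder._build_seq_eos_mask(tokens)
# The first row has an EOS at position 2, so that position and everything after it is
# masked out; the second row has no EOS and keeps its full length:
#   mask -> [[True, True, False, False],
#            [True, True, True,  True ]]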
def forward(self, tokens: torch.Tensor,
            lengths: torch.Tensor) -> Dict[str, torch.Tensor]:
    """
    Encodes a batch of sequences.

    :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
    :param lengths: Torch tensor with the length of each sequence [batch_size].

    :return: Dictionary with `sentemb` (tensor with dims [batch_size x output_units]),
        `wordemb` (tensor with dims [batch_size x seq_len x output_units]),
        `mask` (input mask), `all_layers` (List with word_embeddings from all layers),
        `extra` (list with all XLM-R layers).
    """
    mask = lengths_to_mask(lengths, device=tokens.device)
    all_layers = self.model.extract_features(tokens, return_all_hiddens=True)
    return {
        "sentemb": all_layers[-1][:, 0, :],
        "wordemb": all_layers[-1],
        "all_layers": all_layers,
        "mask": mask,
        "extra": all_layers,
    }
def prepare_sample(
    sample: dict,
    text_encoder: WhitespaceEncoder,
    label_encoder: LabelEncoder,
    max_length: int
) -> (torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Function that receives a sample from the Dataset iterator and prepares
    the input to feed the transformer model.

    :param sample: list of dictionaries containing the inputs to build the batch
        (e.g: [{'source': 'This flight was amazing!', 'target': 'pos'},
               {'source': 'I hate Iberia', 'target': 'neg'}])
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    :param label_encoder: Torch NLP label encoder for vectorization of labels.
    :param max_length: Max length of the input sequences. Sequences longer than
        this value are truncated.
    """
    sample = collate_tensors(sample)
    input_seqs, input_lengths = text_encoder.batch_encode(sample['source'])
    target_seqs = label_encoder.batch_encode(sample['target'])

    # Truncate Inputs
    if input_seqs.size(1) > max_length:
        input_seqs = input_seqs[:, :max_length]
        # Keep the lengths consistent with the truncated sequences so the mask
        # below matches input_seqs.
        input_lengths = torch.clamp(input_lengths, max=max_length)

    input_mask = lengths_to_mask(input_lengths).unsqueeze(1)
    return input_seqs, input_mask, target_seqs
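# Hypothetical usage of prepare_sample for the sentiment example in the docstring
# (the encoders are built elsewhere; the variable names here are illustrative only):
#   batch = [{'source': 'This flight was amazing!', 'target': 'pos'},
#            {'source': 'I hate Iberia', 'target': 'neg'}]
#   seqs, mask, targets = prepare_sample(batch, text_encoder, label_encoder, max_length=128)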
def train_manager(configs: dict) -> None:
    """
    Model training entry point.

    :param configs: Dictionary with the configs defined in default.yaml
    """
    with open('.preprocess.pkl', 'rb') as preprocess_file:
        text_encoder, train, test = pickle.load(preprocess_file)

    set_seed(configs.get('seed', 3))
    print(f'- nr. of training examples {len(train)}')
    print(f'- nr. of test examples {len(test)}')
    print(f'- vocab size: {text_encoder.vocab_size}')

    # Build Transformer model
    model = GTransformer(emb_size=configs.get('embedding_size', 128),
                         heads=configs.get('num_heads', 8),
                         depth=configs.get('depth', 6),
                         seq_length=configs.get('max_length', 1000),
                         vocab_size=text_encoder.vocab_size)
    model.cuda()

    # Build Optimizer
    opt = torch.optim.Adam(lr=configs.get('lr', 0.0001), params=model.parameters())

    # Training Loop
    model = train_loop(configs, model, opt, train, test, text_encoder)

    # Now that the model is trained, let's inspect its output on a few samples.
    sample = collate_tensors(SAMPLES)
    src_seqs, src_lengths = text_encoder.batch_encode(sample['source'])
    src_mask = lengths_to_mask(src_lengths).unsqueeze(1)
    ys, lengths = greedy_decode(model, src_seqs, src_mask)
    ys = text_encoder.batch_decode(ys, lengths)
    for i in range(len(SAMPLES)):
        print('\nTarget: {}\nModel: {}'.format(SAMPLES[i]['target'], ys[i]))
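# `greedy_decode` is not defined in this excerpt. The sketch below is one possible
# greedy decoder consistent with how the model is called above
# (model(src, trg, src_mask, trg_mask) -> log-probs [batch_size x trg_len x vocab]).
# BOS_IDX, EOS_IDX, the max_len default and the (sequences, lengths) return convention
# are assumptions, not the project's actual implementation.
def greedy_decode_sketch(model, src, src_mask, max_len=50):
    batch_size = src.size(0)
    ys = torch.full((batch_size, 1), BOS_IDX, dtype=torch.long)
    lengths = torch.full((batch_size,), max_len, dtype=torch.long)
    for _ in range(max_len - 1):
        trg_lengths = torch.full((batch_size,), ys.size(1), dtype=torch.long)
        trg_mask = lengths_to_mask(trg_lengths).unsqueeze(1)
        lprobs = model(src=src.cuda(), trg=ys.cuda(),
                       src_mask=src_mask.cuda(), trg_mask=trg_mask.cuda())
        # Pick the most likely token for the last decoded position.
        next_token = lprobs[:, -1, :].argmax(dim=-1).cpu().unsqueeze(1)
        ys = torch.cat([ys, next_token], dim=1)
        # Record the length of sequences that just emitted their first EOS.
        finished = next_token.squeeze(1).eq(EOS_IDX) & lengths.eq(max_len)
        lengths[finished] = ys.size(1)
    return ys, lengths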
def forward(self, tokens: torch.Tensor,
            lengths: torch.Tensor) -> Dict[str, torch.Tensor]:
    """
    Encodes a batch of sequences.

    :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
    :param lengths: Torch tensor with the length of each sequence [batch_size].

    :return: Dictionary with `sentemb` (tensor with dims [batch_size x output_units]),
        `wordemb` (tensor with dims [batch_size x seq_len x output_units]),
        `mask` (input mask), `all_layers` (List with word_embeddings from all layers),
        `extra` (tuple with the last_hidden_state, the pooler_output representing the
        entire sentence and the word embeddings for all BERT layers).
    """
    mask = lengths_to_mask(lengths, device=tokens.device)
    last_hidden_states, pooler_output, all_layers = self.model(tokens, mask)
    return {
        "sentemb": pooler_output,
        "wordemb": last_hidden_states,
        "all_layers": all_layers,
        "mask": mask,
        "extra": (last_hidden_states, pooler_output, all_layers),
    }
def forward(self, tokens: torch.Tensor, lengths: torch.Tensor,
            **kwargs) -> Dict[str, torch.Tensor]:
    """
    Encodes a batch of sequences.

    :param tokens: Torch tensor with the input sequences [batch_size x seq_len].
    :param lengths: Torch tensor with the length of each sequence [batch_size].

    :return: Dictionary with `sentemb` (tensor with dims [batch_size x output_units]),
        `wordemb` (tensor with dims [batch_size x seq_len x output_units]),
        `mask` (input mask), `all_layers` (List with word_embeddings from all layers),
        `extra` (tuple with the LSTM outputs, hidden states and cell states).
    """
    self.lstm.flatten_parameters()  # Is it required? Should this be in the __init__?
    tokens, lengths, unsorted_idx = sort_sequences(tokens, lengths)

    if self.left_pad:
        # convert left-padding to right-padding
        tokens = convert_padding_direction(
            tokens,
            self.padding_idx,
            left_to_right=True,
        )

    bsz, seqlen = tokens.size()

    # embed tokens
    x = self.embed_tokens(tokens)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # pack embedded source tokens into a PackedSequence
    packed_x = nn.utils.rnn.pack_padded_sequence(x, lengths.data.tolist())

    # apply LSTM
    if self.bidirectional:
        state_size = 2 * self._n_layers, bsz, self.hidden_size
    else:
        state_size = self._n_layers, bsz, self.hidden_size
    h0 = x.data.new(*state_size).zero_()
    c0 = x.data.new(*state_size).zero_()
    packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))

    # unpack outputs
    x, _ = nn.utils.rnn.pad_packed_sequence(
        packed_outs, padding_value=self.padding_value)
    assert list(x.size()) == [seqlen, bsz, self.output_units]
    word_embeddings = x

    if self.bidirectional:

        def combine_bidir(outs):
            return torch.cat(
                [
                    torch.cat([outs[2 * i], outs[2 * i + 1]], dim=0).view(
                        1, bsz, self.output_units)
                    for i in range(self._n_layers)
                ],
                dim=0,
            )

        final_hiddens = combine_bidir(final_hiddens)
        final_cells = combine_bidir(final_cells)

    encoder_padding_mask = tokens.eq(self.padding_idx).t()

    # Set padded outputs to -inf so they are not selected by max-pooling
    padding_mask = tokens.eq(self.padding_idx).t().unsqueeze(-1)
    if padding_mask.any():
        x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)

    # Build the sentence embedding by max-pooling over the encoder outputs
    sentemb = x.max(dim=0)[0]

    model_out = self.reorder_output(
        encoder_out={
            "sentemb": sentemb,
            "extra": (word_embeddings, final_hiddens, final_cells),
        },
        new_order=unsorted_idx,
    )
    model_out["mask"] = lengths_to_mask(lengths, device=tokens.device)
    model_out["wordemb"] = model_out["extra"][0].transpose(0, 1)
    model_out["all_layers"] = [model_out["wordemb"]]
    return model_out
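# `sort_sequences` is not shown in this excerpt. It is assumed to sort the batch by
# decreasing length (as pack_padded_sequence expects) and to return the permutation
# that undoes the sort; a minimal sketch under that assumption:
def sort_sequences_sketch(tokens: torch.Tensor, lengths: torch.Tensor):
    sorted_lengths, sorted_idx = lengths.sort(descending=True)
    # Permutation that restores the original batch order after encoding.
    _, unsorted_idx = sorted_idx.sort()
    return tokens[sorted_idx], sorted_lengths, unsorted_idx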