def prepare_sample(self, sample: list, prepare_target: bool = True) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries. After collation the batch is read
        through its "text" key and, when targets are requested, its "label" key.
    :param prepare_target: when False the labels are not encoded and an empty
        target dictionary is returned.

    Returns:
        - dictionary with the expected model inputs ("tokens" and "lengths").
        - dictionary with the expected target labels ("labels"), or an empty
          dictionary when `prepare_target` is False.

    :raises Exception: when the label encoder raises a RuntimeError while
        encoding the labels; the original error is attached as the cause.
    """
    sample = collate_tensors(sample)
    tokens, lengths = self.tokenizer.batch_encode(sample["text"])
    inputs = {"tokens": tokens, "lengths": lengths}

    if not prepare_target:
        return inputs, {}

    # Prepare target: only the encoding call is guarded, so a RuntimeError
    # coming from anywhere else is never reported as an unknown label.
    try:
        labels = self.data.label_encoder.batch_encode(sample["label"])
    except RuntimeError as err:
        # Chain explicitly so the encoder's own message stays in the traceback.
        raise Exception("Label encoder found an unknown label.") from err
    return inputs, {"labels": labels}
def prepare_sample(self, sample: list, prepare_target: bool = True) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries. After collation the batch is read
        through its "text" key and, when targets are requested, its "label" key.
    :param prepare_target: when False the labels are not encoded and an empty
        target dictionary is returned.
    :return:
        - dictionary with the expected model inputs ("tokens" and "lengths").
        - dictionary with the expected target labels ("labels"), or an empty
          dictionary when `prepare_target` is False.
    :raises Exception: when the label encoder raises a RuntimeError while
        encoding the labels; the original error is attached as the cause.
    """
    from collections.abc import Mapping

    sample = collate_tensors(sample)
    encoded = self.tokenizer(
        sample['text'],
        return_tensors='pt',
        padding=True,
        return_length=True,
        return_token_type_ids=False,
        return_attention_mask=False,
        truncation='only_first',
        max_length=512,
    )
    # NOTE(review): the keyword arguments look like a Hugging Face tokenizer
    # call, whose result is dict-like. Unpacking a mapping into two names
    # yields its *keys* (strings), not the tensors, so the values are read by
    # key instead. A tokenizer returning a plain (tokens, lengths) pair is
    # still unpacked as before.
    if isinstance(encoded, Mapping):
        tokens, lengths = encoded["input_ids"], encoded["length"]
    else:
        tokens, lengths = encoded
    inputs = {"tokens": tokens, "lengths": lengths}

    if not prepare_target:
        return inputs, {}

    # Prepare target: only the encoding call is guarded, so a RuntimeError
    # coming from anywhere else is never reported as an unknown label.
    try:
        labels = self.data.label_encoder.batch_encode(sample["label"])
    except RuntimeError as err:
        # Chain explicitly so the encoder's own message stays in the traceback.
        raise Exception("Label encoder found an unknown label.") from err
    return inputs, {'labels': labels}
def prepare_sample(
    self, sample: List[Dict[str, Union[str, float]]], inference: bool = False
) -> Union[Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[
        str, torch.Tensor]]:
    """
    Function that prepares a sample to input the model.

    The collated batch is read through its "mt" and "src" keys; each one is
    passed to `self.encoder.prepare_sample` and the resulting keys are
    prefixed with "mt_" / "src_" before being merged into a single dictionary.

    :param sample: list of dictionaries.
    :param inference: If set to true prepares only the model inputs.

    :returns: Tuple with 2 dictionaries (model inputs and targets, the latter
        holding a float tensor under "score").
        If `inference=True` returns only the model inputs.
    """
    batch = collate_tensors(sample)

    model_inputs = {}
    # "mt" is encoded before "src", and its keys are inserted first.
    for segment in ("mt", "src"):
        encoded = self.encoder.prepare_sample(batch[segment])
        for key, value in encoded.items():
            model_inputs[segment + "_" + key] = value

    if inference:
        return model_inputs

    score = torch.tensor(batch["score"], dtype=torch.float)
    return model_inputs, {"score": score}
def prepare_sample(sample: list, tokenizer, prepare_target: bool = True) -> (dict, dict):
    """
    Function that tokenizes a batch of sequences and plots their token counts.

    :param sample: list of dictionaries. After collation the batch is read
        through its "seq" key.
    :param tokenizer: object providing `encode` and `batch_encode_plus`.
    :param prepare_target: accepted but not used by this function.

    Returns:
        - the value returned by `tokenizer.batch_encode_plus` for the whole
          batch (a single value, despite the two-element return annotation).

    Side effects: prints every encoded sequence and the list of token counts,
    and saves a distribution plot to 'token_count.png'.
    """
    batch = collate_tensors(sample)

    # Each sequence is turned into a list of its individual elements.
    split_sequences = [list(seq) for seq in batch["seq"]]

    token_counts = []
    for elements in split_sequences:
        encoded = tokenizer.encode(elements)
        print(encoded)
        token_counts.append(len(encoded))
    print(token_counts)

    # NOTE(review): the figure is never cleared or closed here, so repeated
    # calls presumably keep drawing on the same figure - confirm intent.
    sns.distplot(token_counts)
    plt.xlabel('Token count')
    plt.savefig('{}.png'.format('token_count'), bbox_inches='tight')

    return tokenizer.batch_encode_plus(
        split_sequences,
        add_special_tokens=False,
        padding=True,
        truncation=True,
        max_length=2000,
    )
def prepare_sample(
    sample: list, tokenizer, label_encoder, prepare_target: bool = True
) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries. After collation the batch is read
        through its "text" key and, when targets are requested, its "label" key.
    :param tokenizer: object whose `batch_encode` returns a (tokens, lengths) pair.
    :param label_encoder: object whose `batch_encode` encodes the labels.
    :param prepare_target: when False the labels are not encoded.

    Returns:
        - dictionary with the model inputs ("tokens" and "lengths").
        - dictionary with the targets ("labels"), or an empty dictionary when
          `prepare_target` is False.
    """
    batch = collate_tensors(sample)
    token_ids, token_lengths = tokenizer.batch_encode(batch["text"])
    model_inputs = {"tokens": token_ids, "lengths": token_lengths}

    if prepare_target:
        return model_inputs, {"labels": label_encoder.batch_encode(batch["label"])}
    return model_inputs, {}
def prepare_sample(self, sample: list) -> (dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries. After collation the batch is read
        through its "text" key.

    Returns:
        - dictionary with the model inputs (only "tokens").
    """
    batch = collate_tensors(sample)
    # batch_encode yields a (tokens, lengths) pair; the lengths are not
    # part of this model's inputs, so they are discarded.
    token_ids, _ = self.tokenizer.batch_encode(batch["text"])
    return {"tokens": token_ids}
def prepare_sample(self, sample: list) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    The collated "text" entries are encoded with `self.encoder.prepare_sample`
    and the resulting tokens are passed through `mask_tokens` together with
    the encoder's tokenizer and `self.hparams.mlm_probability`.

    :param sample: list of dictionaries.

    Returns:
        - dictionary with the expected model inputs ("tokens" and "lengths").
        - dictionary with the expected target values ("lm_labels").
    """
    batch = collate_tensors(sample)
    encoded = self.encoder.prepare_sample(batch["text"], trackpos=False)

    # mask_tokens returns the (presumably masked) tokens and their labels.
    masked_tokens, lm_labels = mask_tokens(
        encoded["tokens"],
        self.encoder.tokenizer,
        self.hparams.mlm_probability,
    )

    model_inputs = {"tokens": masked_tokens, "lengths": encoded["lengths"]}
    targets = {"lm_labels": lm_labels}
    return model_inputs, targets
def __collate_fn(self, sample: list, prepare_target=True):
    """
    Collate function for torch.utils.data.DataLoader.

    Changes the layout of the data from a list of dicts to a dict of lists,
    e.g. from

        [{text: 'a', label: '0'}, {text: 'b', label: '1'}, {text: 'c', label: '2'}]

    to

        {text: ['a', 'b', 'c'], label: [0, 1, 2]}

    and encodes the "incorrect" / "correct" entries with `self.tokenizer`
    (token ids in the vocabulary, with 0 padding).

    :param sample: list of dictionaries holding "incorrect" and "correct" entries.
    :param prepare_target: accepted but not used by this function.

    Returns:
        - dictionary with the model inputs ("src_ids" and "src_lengths").
        - dictionary with the targets ("trg_ids" and "trg_lengths").
    """
    # Longest "incorrect" entry first (ties keep their original order);
    # this ordering is needed for packed sequences.
    ordered = sorted(sample, key=lambda item: len(item["incorrect"]), reverse=True)
    batch = collate_tensors(ordered, stack_tensors=stack_and_pad_tensors)

    # TODO: to be replaced
    src_ids, src_lengths = self.tokenizer.batch_encode(batch["incorrect"])

    # TODO: to be replaced
    # Encode tokens based on the vocabulary.
    trg_ids, trg_lengths = self.tokenizer.batch_encode(batch["correct"])

    # The layout is deliberately kept as [batch, seq_len] and not transposed
    # to [seq_len, batch] here: with a distributed data loader (multi-GPU)
    # the first dimension is divided by the number of GPUs.
    return (
        {"src_ids": src_ids, "src_lengths": src_lengths},
        {"trg_ids": trg_ids, "trg_lengths": trg_lengths},
    )
def prepare_sample(self, sample: list, prepare_target: bool = True) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries. After collation the batch is read
        through its "text" key and, when targets are requested, its "tags"
        key (whitespace-separated tag strings).
    :param prepare_target: when False only the inputs are prepared and an
        empty target dictionary is returned.

    Returns:
        - dictionary with the expected model inputs.
        - dictionary with the expected target values ("tags"), padded with
          `self.label_encoder.vocab_size`.
    """
    batch = collate_tensors(sample)
    model_inputs = self.encoder.prepare_sample(batch["text"], trackpos=True)

    if not prepare_target:
        return model_inputs, {}

    encoded_tags = [
        self.label_encoder.batch_encode(tag_string.split())
        for tag_string in batch["tags"]
    ]
    tags, _ = stack_and_pad_tensors(
        encoded_tags, padding_index=self.label_encoder.vocab_size
    )

    if self.hparams.ignore_first_title:
        # Replace a leading "T" tag by the padding index.
        # NOTE(review): this reads `self._label_encoder` while the rest of
        # the method uses `self.label_encoder` - confirm both are the same.
        first_column = tags[:, 0].clone()
        title_mask = first_column == self._label_encoder.token_to_index["T"]
        tags[:, 0] = first_column.masked_fill_(
            title_mask, self.label_encoder.vocab_size
        )

    # TODO is this still needed ?
    if self.hparams.ignore_last_tag:
        tag_counts = np.asarray(
            [len(tag_string.split()) for tag_string in batch["tags"]]
        )
        for row, count in enumerate(tag_counts):
            # The last real tag of a row is replaced by the padding index
            # when it equals 1 (meaning of index 1 not visible here).
            if tags[row][count - 1] == 1:
                tags[row][count - 1] = self.label_encoder.vocab_size

    return model_inputs, {"tags": tags}
def prepare_sample(
    sample: dict, text_encoder: WhitespaceEncoder
) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Function that receives a sample from the Dataset iterator and prepares
    the input to feed the transformer model.

    :param sample: dictionary containing the inputs to build the batch
        (e.g: [{'source': '9 0', 'target': '0 9'},
               {'source': '34 3 4', 'target': '4 3 34'}])
    :param text_encoder: Torch NLP text encoder for tokenization and
        vectorization.

    Returns:
        - encoded source sequences and their lengths.
        - encoded target sequences.
        - the target sequences with a '<s>' column prepended and their last
          column dropped (used to initialize the decoder).
        - the target lengths.
    """
    batch = collate_tensors(sample)
    source_ids, source_lengths = text_encoder.batch_encode(batch['source'])
    target_ids, target_lengths = text_encoder.batch_encode(batch['target'])

    # bos tokens to initialize decoder: one '<s>' index per batch row.
    batch_size = target_ids.size(0)
    bos_column = torch.full(
        [batch_size, 1], text_encoder.stoi['<s>'], dtype=torch.long
    )
    decoder_inputs = torch.cat((bos_column, target_ids[:, :-1]), dim=1)

    return source_ids, source_lengths, target_ids, decoder_inputs, target_lengths
def prepare_sample(
    self, sample: List[Dict[str, Union[str, float]]], inference: bool = False
) -> Union[Tuple[Dict[str, torch.Tensor], None], List[Dict[str, torch.Tensor]]]:
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries.
    :param inference: If set to True, the "src", "mt" and "ref" segments
        (plus "alt" when present) are prepared; otherwise the "ref", "src",
        "pos" and "neg" segments are prepared.

    :return: When `inference=True`, a tuple with the prepared source, MT,
        reference and alternative inputs (the last one is None when the batch
        has no "alt" entry).
        Otherwise a tuple with a dictionary containing the model inputs
        (keys prefixed with "ref_", "src_", "pos_" and "neg_") and an empty
        tensor in place of targets.
    """
    batch = collate_tensors(sample)

    if inference:
        src_inputs, mt_inputs, ref_inputs = (
            self.encoder.prepare_sample(batch[segment])
            for segment in ("src", "mt", "ref")
        )
        alt_inputs = None
        if "alt" in batch:
            alt_inputs = self.encoder.prepare_sample(batch["alt"])
        return src_inputs, mt_inputs, ref_inputs, alt_inputs

    model_inputs = {}
    # Segments are encoded and inserted in this fixed order.
    for segment in ("ref", "src", "pos", "neg"):
        encoded = self.encoder.prepare_sample(batch[segment])
        for key, value in encoded.items():
            model_inputs[segment + "_" + key] = value

    return model_inputs, torch.empty(0)
def prepare_sample(self, sample: list, prepare_target: bool = True) -> (dict, dict):
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries. After collation the batch is read
        through its "seq" key and, when targets are requested, its "label" key.
    :param prepare_target: when exactly False the labels are not encoded and
        an empty target dictionary is returned.

    Returns:
        - the tokenizer output for the batch (the expected model inputs).
        - dictionary with the expected target labels ("labels"), or an empty
          dictionary when `prepare_target` is False.

    :raises Exception: when the label encoder raises a RuntimeError while
        encoding the labels; the offending labels are printed first and the
        original error is attached as the cause.
    """
    sample = collate_tensors(sample)

    # Tokenize the input, return dict with 3 entries:
    #   input_ids: tokenized matrix
    #   token_input_id: matrix of 0,1 indicating if the element belongs to
    #       seq0 or seq1
    #   attention_mask: matrix of 0,1 indicating if a token is masked (0)
    #       or not (1)
    # Convert to PT tensor
    inputs = self.tokenizer.batch_encode_plus(
        sample["seq"],
        add_special_tokens=True,
        padding=True,
        truncation=True,
        max_length=self.sequence_length,
        return_tensors="pt",
    )

    if prepare_target is False:
        return inputs, {}

    # Prepare target: only the encoding call is guarded, so a RuntimeError
    # coming from anywhere else is never reported as an unknown label.
    try:
        labels = self.label_encoder.batch_encode(sample["label"])
    except RuntimeError as err:
        print(sample["label"])
        # Chain explicitly so the encoder's own message stays in the traceback.
        raise Exception("Label encoder found an unknown label.") from err
    return inputs, {"labels": labels}
def prepare_sample(
    sample: dict,
    text_encoder: WhitespaceEncoder,
    label_encoder: LabelEncoder,
    max_length: int
) -> (torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Function that receives a sample from the Dataset iterator and prepares
    the input to feed the transformer model.

    :param sample: dictionary containing the inputs to build the batch
        (e.g: [{'source': 'This flight was amazing!', 'target': 'pos'},
               {'source': 'I hate Iberia', 'target': 'neg'}])
    :param text_encoder: Torch NLP text encoder for tokenization and
        vectorization.
    :param label_encoder: Torch NLP label encoder for vectorization of labels.
    :param max_length: Max length of the input sequences. If a sequence
        passes that value it is truncated.

    Returns:
        - the encoded (and possibly truncated) input sequences.
        - the mask built from the input lengths, with an extra dimension
          inserted at position 1.
        - the encoded targets.
    """
    sample = collate_tensors(sample)
    input_seqs, input_lengths = text_encoder.batch_encode(sample['source'])
    target_seqs = label_encoder.batch_encode(sample['target'])

    # Truncate Inputs
    if input_seqs.size(1) > max_length:
        input_seqs = input_seqs[:, :max_length]
        # The lengths must be capped as well: otherwise the mask below is
        # built from the untruncated lengths and no longer lines up with
        # the truncated `input_seqs`.
        input_lengths = torch.as_tensor(input_lengths).clamp(max=max_length)

    input_mask = lengths_to_mask(input_lengths).unsqueeze(1)
    return input_seqs, input_mask, target_seqs
def train_manager(configs: dict) -> None:
    """
    Model Training function.

    Loads the preprocessed encoder and datasets from '.preprocess.pkl',
    builds the model and optimizer, runs the training loop and finally prints
    the model's decoded output next to the target for every entry in SAMPLES.

    :param configs: Dictionary with the configs defined in default.yaml
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # is only safe while '.preprocess.pkl' is produced locally.
    with open('.preprocess.pkl', 'rb') as preprocess_file:
        text_encoder, train, test = pickle.load(preprocess_file)

    set_seed(configs.get('seed', 3))

    print(f'- nr. of training examples {len(train)}')
    print(f'- nr. of test examples {len(test)}')
    print(f'- vocab size: {text_encoder.vocab_size}')

    # Build Transformer model
    model_kwargs = {
        'emb_size': configs.get('embedding_size', 128),
        'heads': configs.get('num_heads', 8),
        'depth': configs.get('depth', 6),
        'seq_length': configs.get('max_length', 1000),
        'vocab_size': text_encoder.vocab_size,
    }
    model = GTransformer(**model_kwargs)
    model.cuda()

    # Build Optimizer
    learning_rate = configs.get('lr', 0.0001)
    opt = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training Loop
    model = train_loop(configs, model, opt, train, test, text_encoder)

    # Now that the model is trained lets try to see what is the model output!
    batch = collate_tensors(SAMPLES)
    src_seqs, src_lengths = text_encoder.batch_encode(batch['source'])
    src_mask = lengths_to_mask(src_lengths).unsqueeze(1)
    decoded_ids, decoded_lengths = greedy_decode(model, src_seqs, src_mask)
    decoded = text_encoder.batch_decode(decoded_ids, decoded_lengths)

    for index, reference in enumerate(SAMPLES):
        print('\nTarget: {}\nModel: {}'.format(reference['target'], decoded[index]))
import xgboost as xgb
import pandas as pd
from torchnlp.encoders.text import WhitespaceEncoder
from torchnlp.samplers import BucketBatchSampler
from torchnlp.utils import collate_tensors
from torchnlp.encoders.text import stack_and_pad_tensors
from torchnlp.nn import LockedDropout

# Demo script: encode two sentences with a whitespace encoder, then build
# bucketed, padded batches out of variable-length random tensors.
# NOTE(review): `torch` is used below but no `import torch` is visible in
# this span - confirm it is imported elsewhere in the file.
# NOTE(review): `xgb`, `pd` and `LockedDropout` are not used by the
# statements visible here.

# Build the encoder from the example sentences and encode each of them.
loaded_data = ["now this ain't funny", "so don't you dare laugh"]
encoder = WhitespaceEncoder(loaded_data)
encoded_data = [encoder.encode(example) for example in loaded_data]
print("encoded_data", encoded_data)

# From here on `encoded_data` is replaced by random 1-D tensors of sizes
# 2, 3, 4 and 5; the encoded sentences above are no longer used.
encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]

# Sample indices sequentially, then group them into batches of 2 using each
# tensor's first dimension as the sort key.
train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
train_batch_sampler = BucketBatchSampler(
    train_sampler, batch_size=2, drop_last=False, sort_key=lambda i: encoded_data[i].shape[0])

# Turn every batch of indices into the corresponding tensors, then collate
# each batch with stack_and_pad_tensors as the stacking function.
batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
batches = [
    collate_tensors(batch, stack_tensors=stack_and_pad_tensors) for batch in batches
]
print("batches=", batches)