Example #1
    def prepare_sample(self,
                       sample: list,
                       prepare_target: bool = True) -> (dict, dict):
        """
        Function that prepares a sample to input the model.
        :param sample: list of dictionaries.
        
        Returns:
            - dictionary with the expected model inputs.
            - dictionary with the expected target labels.
        """
        sample = collate_tensors(sample)

        tokens, lengths = self.tokenizer.batch_encode(sample["text"])

        inputs = {"tokens": tokens, "lengths": lengths}

        if not prepare_target:
            return inputs, {}

        # Prepare target:
        try:
            targets = {
                "labels": self.data.label_encoder.batch_encode(sample["label"])
            }
            return inputs, targets
        except RuntimeError:
            raise Exception("Label encoder found an unknown label.")
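In practice, a prepare_sample like this is wired into a PyTorch DataLoader as its collate_fn. A minimal sketch, assuming a `model` object exposing the method above and a `train_dataset` yielding {"text": ..., "label": ...} dicts (both names are hypothetical):

from torch.utils.data import DataLoader

# Hypothetical names: `model` exposes prepare_sample, `train_dataset` yields dicts.
loader = DataLoader(
    train_dataset,
    batch_size=32,
    collate_fn=model.prepare_sample,  # each batch arrives as (inputs, targets)
)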
Example #2
    def prepare_sample(self,
                       sample: list,
                       prepare_target: bool = True) -> (dict, dict):
        """
        Function that prepares a sample to input the model.
        :param sample: list of dictionaries.
        :param prepare_target:
        :return:
            - dictionary with the expected model inputs.
            - dictionary with the expected target labels.
        """
        sample = collate_tensors(sample)
        # HuggingFace tokenizers return a dict-like BatchEncoding; tuple
        # unpacking a dict yields its *keys*, so index the entries explicitly.
        encoded = self.tokenizer(sample['text'],
                                 return_tensors='pt',
                                 padding=True,
                                 return_length=True,
                                 return_token_type_ids=False,
                                 return_attention_mask=False,
                                 truncation='only_first',
                                 max_length=512)
        tokens, lengths = encoded['input_ids'], encoded['length']

        inputs = {"tokens": tokens, "lengths": lengths}

        if not prepare_target:
            return inputs, {}

        # Prepare target:
        try:
            targets = {
                'labels': self.data.label_encoder.batch_encode(sample["label"])
            }
            return inputs, targets
        except RuntimeError:
            raise Exception("Label encoder found an unknown label.")
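For reference, a minimal sketch of what the HuggingFace tokenizer call above produces (assumes the transformers library; the checkpoint name is an arbitrary example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoded = tokenizer(["a short text", "a slightly longer one"],
                    return_tensors="pt", padding=True, return_length=True)
print(encoded["input_ids"].shape)  # (batch_size, max_seq_len)
print(encoded["length"])           # per-example token counts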
Example #3
    def prepare_sample(
        self,
        sample: List[Dict[str, Union[str, float]]],
        inference: bool = False
    ) -> Union[Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[
            str, torch.Tensor]]:
        """
        Function that prepares a sample to input the model.
        :param sample: list of dictionaries.
        :param inference: If set to true prepares only the model inputs.

        :returns: Tuple with 2 dictionaries (model inputs and targets). 
            If `inference=True` returns only the model inputs.
        """
        sample = collate_tensors(sample)
        mt_inputs = self.encoder.prepare_sample(sample["mt"])
        src_inputs = self.encoder.prepare_sample(sample["src"])

        mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()}
        src_inputs = {"src_" + k: v for k, v in src_inputs.items()}

        inputs = {**mt_inputs, **src_inputs}

        if inference:
            return inputs

        targets = {"score": torch.tensor(sample["score"], dtype=torch.float)}
        return inputs, targets
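A possible usage sketch: the mt_/src_ key prefixes let a single forward() consume both segments via keyword unpacking. Here `model` and the "score" output key are assumptions, not the repository's confirmed API:

import torch

inputs, targets = model.prepare_sample(batch)  # `batch`: list of dicts (assumed)
predictions = model(**inputs)                  # forward(mt_tokens=..., src_tokens=..., ...)
loss = torch.nn.functional.mse_loss(predictions["score"], targets["score"])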
Example #4
def prepare_sample(sample: list,
                   tokenizer,
                   prepare_target: bool = True) -> dict:
    """
    Function that prepares a sample to input the model.

    :param sample: list of dictionaries.
    :param tokenizer: HuggingFace-style tokenizer used to encode the sequences.
    :param prepare_target: unused in this variant.

    Returns:
        - dictionary with the padded and truncated token ids.
    """
    sample = collate_tensors(sample)

    # Split each sequence into single characters (per-residue tokens).
    slist = [list(seq) for seq in sample["seq"]]

    # Plot the distribution of token counts for inspection.
    token_lens = [len(tokenizer.encode(s)) for s in slist]
    sns.distplot(token_lens)
    plt.xlabel('Token count')
    plt.savefig('token_count.png', bbox_inches='tight')

    ids = tokenizer.batch_encode_plus(slist,
                                      add_special_tokens=False,
                                      padding=True,
                                      truncation=True,
                                      max_length=2000)
    return ids
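Since batch_encode_plus is called without return_tensors, the entries come back as plain Python lists; a small conversion sketch (`batch` and `tokenizer` are assumed to exist):

import torch

ids = prepare_sample(batch, tokenizer)
input_ids = torch.tensor(ids["input_ids"])            # padded token id matrix
attention_mask = torch.tensor(ids["attention_mask"])  # 1 = real token, 0 = padding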
Example #5
def prepare_sample(
    sample: list, tokenizer, label_encoder, prepare_target: bool = True
) -> (dict, dict):
    """
    Standalone variant of prepare_sample.

    :param sample: list of dictionaries.
    :param prepare_target: if False, an empty targets dictionary is returned.
    """
    sample = collate_tensors(sample)
    tokens, lengths = tokenizer.batch_encode(sample["text"])

    inputs = {"tokens": tokens, "lengths": lengths}

    if not prepare_target:
        targets = {}
    else:
        targets = {"labels": label_encoder.batch_encode(sample["label"])}
    return inputs, targets
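Because this variant is a free function with extra arguments, it can be bound with functools.partial before being handed to a DataLoader (a sketch; `tokenizer`, `label_encoder`, and `train_dataset` are assumed to exist):

from functools import partial
from torch.utils.data import DataLoader

collate = partial(prepare_sample, tokenizer=tokenizer, label_encoder=label_encoder)
loader = DataLoader(train_dataset, batch_size=16, collate_fn=collate)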
Example #6
    def prepare_sample(self, sample: list) -> dict:
        """
        Function that prepares a sample to input the model.

        :param sample: list of dictionaries.

        Returns:
            - dictionary with the model inputs.
        """
        sample = collate_tensors(sample)
        tokens, _ = self.tokenizer.batch_encode(sample["text"])  # lengths unused here
        inputs = {"tokens": tokens}

        return inputs
Example #7
    def prepare_sample(self, sample: list) -> (dict, dict):
        """
        Function that prepares a sample to input the model.

        :param sample: list of dictionaries.

        Returns:
            - dictionary with the expected model inputs.
            - dictionary with the expected target values (e.g. HTER score).
        """
        sample = collate_tensors(sample)
        sample = self.encoder.prepare_sample(sample["text"], trackpos=False)
        tokens, labels = mask_tokens(
            sample["tokens"], self.encoder.tokenizer, self.hparams.mlm_probability,
        )
        return {"tokens": tokens, "lengths": sample["lengths"]}, {"lm_labels": labels}
Example #8
    def __collate_fn(self, sample: list, prepare_target=True):
        """
        torch.utils.Dataloader collate_fn

        change layout of data from list of dicts to dict of tensors
         [
           {text: 'a', label:'0'}
           {text: 'b', label:'1'}
           {text: 'c', label:'2'}
         ]
         to
         { text: ['a', 'b', 'c'], label:[0,1,2] }

         and encode tokens to its ids in vocab, do also 0 padding
        """

        # Sort in reverse order of length, needed for packed sequences.

        sorted_sample = sorted(sample, key=lambda x: -len(x["incorrect"]))

        collate_sample = collate_tensors(
            sorted_sample, stack_tensors=stack_and_pad_tensors
        )

        ### TODO: to be replaced
        src_tokens, src_lengths = self.tokenizer.batch_encode(
            collate_sample["incorrect"]
        )

        # Can't change the layout here, because a DistributedDataLoader (multi-GPU)
        # divides the first dim by the number of GPUs.
        # Change from [batch, seq_len] to [seq_len, batch]:
        # src_tokens = src_tokens.transpose(0, 1)

        inputs = {"src_ids": src_tokens, "src_lengths": src_lengths}

        ### TODO: to be replaced
        ### Encode tokens based on the vocab.
        trg_tokens, trg_lengths = self.tokenizer.batch_encode(collate_sample["correct"])

        # Change from [batch, seq_len] to [seq_len, batch]:
        # trg_tokens = trg_tokens.transpose(0, 1)
        targets = {"trg_ids": trg_tokens, "trg_lengths": trg_lengths}

        return inputs, targets
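A tiny demonstration of the layout change described in the docstring (the printed result is the expected torchnlp behavior):

from torchnlp.utils import collate_tensors

batch = [{"text": "a", "label": "0"}, {"text": "b", "label": "1"}]
print(collate_tensors(batch))
# expected: {'text': ['a', 'b'], 'label': ['0', '1']}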
Example #9
    def prepare_sample(self,
                       sample: list,
                       prepare_target: bool = True) -> (dict, dict):
        """
        Function that prepares a sample to input the model.
        :param sample: list of dictionaries.
        
        Returns:
            - dictionary with the expected model inputs.
            - dictionary with the expected target values.
        """
        sample = collate_tensors(sample)
        inputs = self.encoder.prepare_sample(sample["text"], trackpos=True)
        if not prepare_target:
            return inputs, {}

        tags, _ = stack_and_pad_tensors(
            [
                self.label_encoder.batch_encode(tag_str.split())
                for tag_str in sample["tags"]
            ],
            padding_index=self.label_encoder.vocab_size,
        )

        if self.hparams.ignore_first_title:
            first_tokens = tags[:, 0].clone()
            tags[:, 0] = first_tokens.masked_fill_(
                first_tokens == self.label_encoder.token_to_index["T"],
                self.label_encoder.vocab_size,
            )

        # TODO: is this still needed?
        if self.hparams.ignore_last_tag:
            lengths = [len(tag_str.split()) for tag_str in sample["tags"]]
            for k, length in enumerate(lengths):
                if tags[k][length - 1] == 1:
                    tags[k][length - 1] = self.label_encoder.vocab_size

        targets = {"tags": tags}
        return inputs, targets
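Padding the tags with label_encoder.vocab_size pairs naturally with a loss that skips that index; a one-line sketch (an assumption, not the repository's confirmed training code):

import torch.nn as nn

# `label_encoder` as in the snippet above (assumed accessible here).
criterion = nn.CrossEntropyLoss(ignore_index=label_encoder.vocab_size)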
Example #10
def prepare_sample(
    sample: dict, text_encoder: WhitespaceEncoder
) -> (torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Function that receives a sample from the Dataset iterator and prepares t
    he input to feed the transformer model.

    :param sample: dictionary containing the inputs to build the batch 
        (e.g: [{'source': '9 0', 'target': '0 9'}, {'source': '34 3 4', 'target': '4 3 34'}])
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    """
    sample = collate_tensors(sample)
    input_seqs, input_lengths = text_encoder.batch_encode(sample['source'])
    target_seqs, target_lengths = text_encoder.batch_encode(sample['target'])
    # BOS tokens to initialize the decoder (targets shifted right for teacher forcing).
    bos_tokens = torch.full([target_seqs.size(0), 1],
                            text_encoder.stoi['<s>'],
                            dtype=torch.long)
    shifted_target = torch.cat((bos_tokens, target_seqs[:, :-1]), dim=1)
    return input_seqs, input_lengths, target_seqs, shifted_target, target_lengths
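A worked illustration of the BOS shift above (teacher forcing: the decoder input is <s> plus the target minus its final token; the <s> index 2 is an arbitrary assumption):

import torch

target_seqs = torch.tensor([[9, 4, 7]])
bos_tokens = torch.full((1, 1), 2, dtype=torch.long)  # assume index 2 = '<s>'
shifted = torch.cat((bos_tokens, target_seqs[:, :-1]), dim=1)
print(shifted)  # tensor([[2, 9, 4]])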
Example #11
    def prepare_sample(
        self,
        sample: List[Dict[str, Union[str, float]]],
        inference: bool = False
    ) -> Union[Tuple[Dict[str, torch.Tensor], None], List[Dict[str,
                                                               torch.Tensor]]]:
        """
        Function that prepares a sample to input the model.
        
        :param sample: list of dictionaries.
        :param inference: If set to to False, then the model expects 
            a MT and reference instead of anchor, pos, and neg segments.

        :return: Tuple with a dictionary containing the model inputs and None OR List 
            with source, MT and reference tokenized and vectorized.
        """
        sample = collate_tensors(sample)
        if inference:
            src_inputs = self.encoder.prepare_sample(sample["src"])
            mt_inputs = self.encoder.prepare_sample(sample["mt"])
            ref_inputs = self.encoder.prepare_sample(sample["ref"])
            alt_inputs = (self.encoder.prepare_sample(sample["alt"])
                          if "alt" in sample else None)
            return src_inputs, mt_inputs, ref_inputs, alt_inputs

        ref_inputs = self.encoder.prepare_sample(sample["ref"])
        src_inputs = self.encoder.prepare_sample(sample["src"])
        pos_inputs = self.encoder.prepare_sample(sample["pos"])
        neg_inputs = self.encoder.prepare_sample(sample["neg"])

        ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()}
        src_inputs = {"src_" + k: v for k, v in src_inputs.items()}
        pos_inputs = {"pos_" + k: v for k, v in pos_inputs.items()}
        neg_inputs = {"neg_" + k: v for k, v in neg_inputs.items()}

        return {
            **ref_inputs,
            **src_inputs,
            **pos_inputs,
            **neg_inputs
        }, torch.empty(0)
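The anchor/pos/neg layout suggests a triplet objective downstream; a hedged sketch of the typical pairing (not the repository's confirmed loss):

import torch.nn as nn

triplet = nn.TripletMarginLoss(margin=1.0)
# loss = triplet(anchor_embedding, positive_embedding, negative_embedding)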
Example #12
    def prepare_sample(self,
                       sample: list,
                       prepare_target: bool = True) -> (dict, dict):
        """
        Function that prepares a sample to input the model.
        :param sample: list of dictionaries.

        Returns:
            - dictionary with the expected model inputs.
            - dictionary with the expected target labels.
        """
        sample = collate_tensors(sample)

        # Tokenize the input; returns a dict with 3 entries:
        #   input_ids: tokenized matrix
        #   token_type_ids: matrix of 0/1 indicating whether an element belongs to seq0 or seq1
        #   attention_mask: matrix of 0/1 indicating whether a token is masked (0) or not (1)
        # return_tensors="pt" converts the output to PyTorch tensors.
        inputs = self.tokenizer.batch_encode_plus(
            sample["seq"],
            add_special_tokens=True,
            padding=True,
            truncation=True,
            max_length=self.sequence_length,
            return_tensors="pt")

        if not prepare_target:
            return inputs, {}

        # Prepare target:
        try:
            targets = {
                "labels": self.label_encoder.batch_encode(sample["label"])
            }
            return inputs, targets
        except RuntimeError:
            print(sample["label"])
            raise Exception("Label encoder found an unknown label.")
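Because batch_encode_plus(..., return_tensors="pt") yields a dict-like BatchEncoding, the inputs can be unpacked straight into a HuggingFace model's forward pass (a usage sketch; `model` and `bert_model` are assumed names):

inputs, targets = model.prepare_sample(batch)
outputs = bert_model(**inputs)  # input_ids, token_type_ids, attention_mask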
Example #13
def prepare_sample(
    sample: dict, text_encoder: WhitespaceEncoder, label_encoder: LabelEncoder,
    max_length: int
) -> (torch.Tensor, torch.Tensor, torch.Tensor):
    """
    Function that receives a sample from the Dataset iterator and prepares
    the input to feed the transformer model.

    :param sample: dictionary containing the inputs to build the batch
        (e.g.: [{'source': 'This flight was amazing!', 'target': 'pos'},
               {'source': 'I hate Iberia', 'target': 'neg'}])
    :param text_encoder: Torch NLP text encoder for tokenization and vectorization.
    :param label_encoder: Torch NLP label encoder for vectorization of labels.
    :param max_length: Max length of the input sequences.
        If a sequence exceeds that value, it is truncated.
    """
    sample = collate_tensors(sample)
    input_seqs, input_lengths = text_encoder.batch_encode(sample['source'])
    target_seqs = label_encoder.batch_encode(sample['target'])
    # Truncate Inputs
    if input_seqs.size(1) > max_length:
        input_seqs = input_seqs[:, :max_length]
    input_mask = lengths_to_mask(input_lengths).unsqueeze(1)
    return input_seqs, input_mask, target_seqs
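For reference, lengths_to_mask (torchnlp.utils) turns per-example lengths into a boolean padding mask; the output below is the expected behavior:

from torchnlp.utils import lengths_to_mask

mask = lengths_to_mask([3, 1])
# expected:
# tensor([[ True,  True,  True],
#         [ True, False, False]])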
Example #14
def train_manager(configs: dict) -> None:
    """
    Model training function.

    :param configs: Dictionary with the configs defined in default.yaml.
    """
    with open('.preprocess.pkl', 'rb') as preprocess_file:
        text_encoder, train, test = pickle.load(preprocess_file)

    set_seed(configs.get('seed', 3))
    print(f'- nr. of training examples {len(train)}')
    print(f'- nr. of test examples {len(test)}')
    print(f'- vocab size: {text_encoder.vocab_size}')

    # Build Transformer model
    model = GTransformer(emb_size=configs.get('embedding_size', 128),
                         heads=configs.get('num_heads', 8),
                         depth=configs.get('depth', 6),
                         seq_length=configs.get('max_length', 1000),
                         vocab_size=text_encoder.vocab_size)
    model.cuda()

    # Build Optimizer
    opt = torch.optim.Adam(lr=configs.get('lr', 0.0001),
                           params=model.parameters())

    # Training Loop
    model = train_loop(configs, model, opt, train, test, text_encoder)

    # Now that the model is trained, let's inspect its output on a few samples.
    sample = collate_tensors(SAMPLES)
    src_seqs, src_lengths = text_encoder.batch_encode(sample['source'])
    src_mask = lengths_to_mask(src_lengths).unsqueeze(1)
    ys, lengths = greedy_decode(model, src_seqs, src_mask)
    ys = text_encoder.batch_decode(ys, lengths)
    for i in range(len(SAMPLES)):
        print('\nTarget: {}\nModel:  {}'.format(SAMPLES[i]['target'], ys[i]))
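For completeness, a typical set_seed helper (an assumption; the script's own helper may differ):

import random
import numpy as np
import torch

def set_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)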
Example #15
import torch
from torchnlp.encoders.text import WhitespaceEncoder, stack_and_pad_tensors
from torchnlp.samplers import BucketBatchSampler
from torchnlp.utils import collate_tensors

loaded_data = ["now this ain't funny", "so don't you dare laugh"]
encoder = WhitespaceEncoder(loaded_data)
encoded_data = [encoder.encode(example) for example in loaded_data]

print("encoded_data", encoded_data)

# Separate demo: variable-length random tensors to show bucketing and padding.
encoded_data = [torch.randn(2), torch.randn(3), torch.randn(4), torch.randn(5)]

train_sampler = torch.utils.data.sampler.SequentialSampler(encoded_data)
train_batch_sampler = BucketBatchSampler(
    train_sampler,
    batch_size=2,
    drop_last=False,
    sort_key=lambda i: encoded_data[i].shape[0])

batches = [[encoded_data[i] for i in batch] for batch in train_batch_sampler]
batches = [
    collate_tensors(batch, stack_tensors=stack_and_pad_tensors)
    for batch in batches
]

print("batches=", batches)