Example #1
0
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path):
        """Load tokenized sentence pairs from ``file_path`` or from its cache.

        Each usable line has the form ``source ||| target``. Blank lines are
        skipped silently; any other line that does not split into exactly two
        non-empty sides is skipped with an info log.

        Every kept pair is appended to ``self.examples`` as the tuple
        ``(ids_src, ids_tgt, bpe2word_map_src, bpe2word_map_tgt)``: the
        ``input_ids`` returned by ``tokenizer.prepare_for_model`` for each
        side, followed by one list per side that gives, for every subword,
        the index of the whitespace-separated word it was produced from.

        Args:
            tokenizer: Tokenizer providing ``tokenize``,
                ``convert_tokens_to_ids``, ``prepare_for_model`` and
                ``max_len``.
            args: Options object; ``cache_data`` and ``overwrite_cache`` are
                read from it.
            file_path: Path of the parallel text file. The cache lives next
                to it at ``f'{file_path}.cache'``.
        """
        assert os.path.isfile(file_path)
        logger.info("Creating features from dataset file at %s", file_path)

        cache_fn = f'{file_path}.cache'
        if (args.cache_data and os.path.isfile(cache_fn)
                and not args.overwrite_cache):
            logger.info("Loading cached data from %s", cache_fn)
            # NOTE(review): torch.load unpickles the file, so the cache must
            # come from a trusted location.
            self.examples = torch.load(cache_fn)
            return

        def encode(sentence):
            """Return ``(input_ids, bpe2word_map)`` for one side of a pair."""
            # One list of subword strings per whitespace-separated word.
            pieces_per_word = [
                tokenizer.tokenize(word) for word in sentence.split()
            ]
            ids_per_word = [
                tokenizer.convert_tokens_to_ids(pieces)
                for pieces in pieces_per_word
            ]
            input_ids = tokenizer.prepare_for_model(
                list(itertools.chain.from_iterable(ids_per_word)),
                return_tensors='pt',
                max_length=tokenizer.max_len)['input_ids']
            # Repeat each word index once per subword of that word.
            bpe2word_map = [
                word_idx
                for word_idx, pieces in enumerate(pieces_per_word)
                for _ in pieces
            ]
            return input_ids, bpe2word_map

        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            # Iterate the file lazily instead of loading every line at once.
            for line in f:
                if line.isspace():
                    continue
                sides = line.split(' ||| ')
                if (len(sides) != 2 or not sides[0].rstrip()
                        or not sides[1].rstrip()):
                    logger.info("Skipping instance %s", line)
                    continue

                ids_src, bpe2word_map_src = encode(sides[0])
                ids_tgt, bpe2word_map_tgt = encode(sides[1])
                self.examples.append(
                    (ids_src, ids_tgt, bpe2word_map_src, bpe2word_map_tgt))

        if args.cache_data:
            logger.info("Saving cached data to %s", cache_fn)
            torch.save(self.examples, cache_fn)
Example #2
0
def mask_tokens(inputs: torch.Tensor,
                tokenizer: PreTrainedTokenizer,
                args,
                langid_mask=None,
                lang_id=None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.

    Positions are sampled with probability ``args.mlm_probability``. Special
    tokens, padding tokens (when the tokenizer has a pad token) and positions
    where ``langid_mask`` equals ``lang_id`` are never sampled.

    Args:
        inputs: Tensor of token ids. It is modified in place.
        tokenizer: Tokenizer providing ``mask_token``,
            ``get_special_tokens_mask``, ``convert_tokens_to_ids``,
            ``pad_token_id`` and ``len()``.
        args: Options object; ``mlm_probability`` is read from it.
        langid_mask: Optional tensor compared element-wise with ``lang_id``;
            matching positions are excluded from masking.
        lang_id: Value looked up in ``langid_mask``.

    Returns:
        ``(inputs, labels)``: the same ``inputs`` tensor after corruption,
        and a copy of the original ids in which every position that was not
        sampled is set to -100.

    Raises:
        ValueError: If the tokenizer has no mask token.
    """

    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    # All masks are torch.bool: on a uint8 tensor `~` is a bitwise NOT, which
    # maps 0/1 to 255/254, so every position would count as selected.
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                 dtype=torch.bool),
                                    value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)

    if langid_mask is not None:
        excluded_lang_mask = langid_mask.eq(lang_id)
        probability_matrix.masked_fill_(excluded_lang_mask, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape,
                                                  0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    # (half of the 20% of sampled positions that were not replaced above).
    indices_random = torch.bernoulli(torch.full(
        labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer),
                                 labels.shape,
                                 dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
Example #3
0
    def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path):
        """Read and tokenize every ``source ||| target`` line of ``file_path``.

        Unlike a lenient loader, any unusable line aborts loading with a
        ``ValueError`` that names the 1-based line number.

        Every pair is appended to ``self.examples`` as the tuple
        ``(ids_src, ids_tgt, bpe2word_map_src, bpe2word_map_tgt)``: the first
        row of the ``input_ids`` returned by ``tokenizer.prepare_for_model``
        for each side, followed by one list per side that gives, for every
        subword, the index of the whitespace-separated word it came from.

        Args:
            tokenizer: Tokenizer providing ``tokenize``,
                ``convert_tokens_to_ids``, ``prepare_for_model`` and
                ``max_len``.
            args: Not read by this method.
            file_path: Path of the parallel text file.

        Raises:
            ValueError: If a line does not consist of exactly two non-empty
                sides separated by `` ||| ``, or if either side encodes to
                exactly two ids.
        """
        assert os.path.isfile(file_path)
        print('Loading the dataset...')

        def encode(sentence):
            """Return ``(input_ids, bpe2word_map)`` for one side of a pair."""
            # One list of subword strings per whitespace-separated word.
            pieces_per_word = [
                tokenizer.tokenize(word) for word in sentence.split()
            ]
            ids_per_word = [
                tokenizer.convert_tokens_to_ids(pieces)
                for pieces in pieces_per_word
            ]
            input_ids = tokenizer.prepare_for_model(
                list(itertools.chain.from_iterable(ids_per_word)),
                return_tensors='pt',
                max_length=tokenizer.max_len)['input_ids']
            # Repeat each word index once per subword of that word.
            bpe2word_map = [
                word_idx
                for word_idx, pieces in enumerate(pieces_per_word)
                for _ in pieces
            ]
            return input_ids, bpe2word_map

        self.examples = []
        with open(file_path, encoding="utf-8") as f:
            # Iterate the file lazily instead of loading every line at once.
            for line_no, line in enumerate(f, start=1):
                # Blank lines split into a single part, so they fail here too.
                sides = line.split(' ||| ')
                if (len(sides) != 2 or not sides[0].rstrip()
                        or not sides[1].rstrip()):
                    raise ValueError(
                        f'Line {line_no} is not in the correct format!')

                ids_src, bpe2word_map_src = encode(sides[0])
                ids_tgt, bpe2word_map_tgt = encode(sides[1])
                # NOTE(review): a length of 2 presumably means only the
                # special tokens added by prepare_for_model are left, i.e.
                # the side produced no subwords; confirm for the tokenizer
                # in use.
                if len(ids_src[0]) == 2 or len(ids_tgt[0]) == 2:
                    raise ValueError(
                        f'Line {line_no} is not in the correct format!')

                self.examples.append((ids_src[0], ids_tgt[0],
                                      bpe2word_map_src, bpe2word_map_tgt))