Example #1
    def test_space(self):
        # for example, character models treat space as a symbol
        dict_file = io.StringIO("  999\n" "a 999\n" "b 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index(" "), 4)
        self.assertEqual(d.index("a"), 5)
        self.assertEqual(d.index("b"), 6)
Example #2
    def test_character_token_embedder(self):
        vocab = Dictionary()
        vocab.add_symbol('hello')
        vocab.add_symbol('there')

        embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64),
                                                  (16, 2)], 64, 5, 2)

        test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
        max_len = max(len(s) for s in test_sents)
        input = torch.LongTensor(len(test_sents),
                                 max_len + 2).fill_(vocab.pad())
        for i in range(len(test_sents)):
            input[i][0] = vocab.eos()
            for j in range(len(test_sents[i])):
                input[i][j + 1] = vocab.index(test_sents[i][j])
            input[i][j + 2] = vocab.eos()
        embs = embedder(input)

        assert embs.size() == (len(test_sents), max_len + 2, 5)
        self.assertAlmostEqual(embs[0][0], embs[1][0])
        self.assertAlmostEqual(embs[0][0], embs[0][-1])
        self.assertAlmostEqual(embs[0][1], embs[2][1])
        self.assertAlmostEqual(embs[0][3], embs[1][1])

        embs.sum().backward()
        assert embedder.char_embeddings.weight.grad is not None
Example #3
    def _get_test_data(self):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor([i + 1 for i in src_len])
Example #4
    def _get_test_data(self, append_eos=True):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor(src_len)
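For reference, the tensor returned above is time-major after the transpose; a hedged sketch of the expected shapes for the two sample sentences (assuming the default append_eos=True):

    vocab, x, src_lengths = self._get_test_data()
    # the longest sentence has 7 subwords plus EOS and there are 2 sentences,
    # so x is (max_len, batch) = (8, 2) after the transpose
    assert tuple(x.shape) == (8, 2)
    assert src_lengths.tolist() == [8, 5]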
Example #5
    def setup_task(cls, args, **kwargs):
        # Here we can perform any setup required for the task. This may include
        # loading Dictionaries, initializing shared Embedding layers, etc.
        # In this case we'll just load the Dictionaries.
        reloaded = torch.load(args.xlmr_model_dict)
        params = AttrDict(reloaded['params'])

        # build dictionary / update parameters
        input_vocab = Dictionary(reloaded['dico_id2word'],
                                 reloaded['dico_word2id'],
                                 reloaded['dico_counts'])
        params.n_words = len(input_vocab)
        params.bos_index = input_vocab.index(BOS_WORD)
        params.eos_index = input_vocab.index(EOS_WORD)
        params.pad_index = input_vocab.index(PAD_WORD)
        params.unk_index = input_vocab.index(UNK_WORD)
        params.mask_index = input_vocab.index(MASK_WORD)

        label_vocab = Dictionary.load(os.path.join(args.data,
                                                   'dict.label.txt'))
        print('| [input] dictionary: {} types'.format(len(input_vocab)))
        print('| [label] dictionary: {} types'.format(len(label_vocab)))

        return SemparseSeq2SeqTask(args, input_vocab, label_vocab)
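A hedged sketch of how this method might be invoked, assuming it is declared as a @classmethod on SemparseSeq2SeqTask as its cls parameter suggests; the paths are placeholders:

    import argparse

    # hypothetical paths; args only needs the attributes setup_task reads
    args = argparse.Namespace(xlmr_model_dict="/path/to/model_dict.pth",
                              data="/path/to/data-bin")
    task = SemparseSeq2SeqTask.setup_task(args)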
Example #6
    def _convert_src_tokens_to_tensor(
        self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
    ):
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return x, torch.LongTensor(src_len)
Example #7
    def test_overwrite(self):
        # for example, Camembert overwrites <unk>, <s> and </s>
        dict_file = io.StringIO("<unk> 999 #fairseq:overwrite\n"
                                "<s> 999 #fairseq:overwrite\n"
                                "</s> 999 #fairseq:overwrite\n"
                                ", 999\n"
                                "▁de 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index("<pad>"), 1)
        self.assertEqual(d.index("foo"), 3)
        self.assertEqual(d.index("<unk>"), 4)
        self.assertEqual(d.index("<s>"), 5)
        self.assertEqual(d.index("</s>"), 6)
        self.assertEqual(d.index(","), 7)
        self.assertEqual(d.index("▁de"), 8)
Example #8
    def test_add_file_to_dict(self):
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            dict = Dictionary()
            Dictionary.add_file_to_dictionary(filename, dict,
                                              tokenizer.tokenize_line, 10)
            dict.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dict.get_count(dict.index(c))
                self.assertEqual(
                    counts[c], count,
                    f"{c} count is {count} but should be {counts[c]}")
Example #9
def _lang_id(dic: Dictionary, lang: str):
    """Return language ID index."""
    idx = dic.index(lang)
    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)
    return idx
Example #10
def _lang_token_index(dic: Dictionary, lang: str):
    """Return language token index."""
    idx = dic.index(_lang_token(lang))
    assert idx != dic.unk_index, \
        'cannot find language token for lang {}'.format(lang)
    return idx
Example #11
    def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
        lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
        assert lang_tag_idx != dictionary.unk()
        return lang_tag_idx
Example #12
def _lang_token_index(dic: Dictionary, lang: str, style="__{}__"):
    """Return language token index."""
    idx = dic.index(_lang_token(lang, style))
    assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
    return idx
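A hedged usage sketch, assuming _lang_token(lang, style) simply returns style.format(lang) and that the dictionary was built with the corresponding token; the language code is illustrative:

    # hypothetical lookup: resolves the id of "__de__" in dic, asserting that
    # it is present rather than mapping to <unk>
    de_idx = _lang_token_index(dic, "de", style="__{}__")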
Example #13
class XLMRobertaTokenizer(PreTrainedTokenizer):
    """Custom tokenizer for our custom pretrained model. 
    You can ignore this file if you use another pretrained model. For example, if you use PhoBert, you should tokenize by using VnCoreNLP.
    """
    def __init__(self,
                 pretrained_file,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sentencepiece_model)
        # note: sp_model is only used to split text into pieces; taking ids or
        # anything else from it would break the token-to-id mapping below

        self.bpe_dict = Dictionary().load(vocab_file)

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {
            "<s>": 0,
            "<pad>": 1,
            "</s>": 2,
            "<unk>": 3
        }

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0

        self.fairseq_tokens_to_ids["<mask>"] = len(
            self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {
            v: k
            for k, v in self.fairseq_tokens_to_ids.items()
        }

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        # + 1 accounts for the added <mask> token
        return len(self.bpe_dict) + self.fairseq_offset + 1

    def get_vocab(self):
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab
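A hedged usage sketch; the directory path is a placeholder and must contain the sentencepiece.bpe.model and dict.txt files loaded in __init__:

    # hypothetical directory holding sentencepiece.bpe.model and dict.txt
    tok = XLMRobertaTokenizer(pretrained_file="/path/to/pretrained")
    pieces = tok.tokenize("hello world")      # sentencepiece pieces
    ids = tok.convert_tokens_to_ids(pieces)   # ids via the fairseq/bpe dict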