Example #1
    def test_space(self):
        # for example, character models treat space as a symbol
        dict_file = io.StringIO("  999\n" "a 999\n" "b 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index(" "), 4)
        self.assertEqual(d.index("a"), 5)
        self.assertEqual(d.index("b"), 6)
Example #2
    def test_character_token_embedder(self):
        vocab = Dictionary()
        vocab.add_symbol('hello')
        vocab.add_symbol('there')

        embedder = CharacterTokenEmbedder(vocab, [(2, 16), (4, 32), (8, 64),
                                                  (16, 2)], 64, 5, 2)

        test_sents = [['hello', 'unk', 'there'], ['there'], ['hello', 'there']]
        max_len = max(len(s) for s in test_sents)
        input = torch.LongTensor(len(test_sents),
                                 max_len + 2).fill_(vocab.pad())
        for i in range(len(test_sents)):
            input[i][0] = vocab.eos()
            for j in range(len(test_sents[i])):
                input[i][j + 1] = vocab.index(test_sents[i][j])
            input[i][j + 2] = vocab.eos()
        embs = embedder(input)

        assert embs.size() == (len(test_sents), max_len + 2, 5)
        self.assertAlmostEqual(embs[0][0], embs[1][0])
        self.assertAlmostEqual(embs[0][0], embs[0][-1])
        self.assertAlmostEqual(embs[0][1], embs[2][1])
        self.assertAlmostEqual(embs[0][3], embs[1][1])

        embs.sum().backward()
        assert embedder.char_embeddings.weight.grad is not None
Example #3
    def _get_test_data(self):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        x = torch.LongTensor(len(src_tokens), max(src_len) + 1).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor([i + 1 for i in src_len])
Example #4
    def _get_test_data(self, append_eos=True):
        vocab = Dictionary()
        vocab.add_symbol("he@@")
        vocab.add_symbol("llo")
        vocab.add_symbol("how")
        vocab.add_symbol("are")
        vocab.add_symbol("y@@")
        vocab.add_symbol("ou")
        vocab.add_symbol("n@@")
        vocab.add_symbol("ew")
        vocab.add_symbol("or@@")
        vocab.add_symbol("k")

        src_tokens = [
            ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
            ["how", "are", "y@@", "ou"],
        ]
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return vocab, x, torch.LongTensor(src_len)
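For reference, the tensor returned above is time-major after the transpose; a hedged sketch of the expected shapes for the two sample sentences (assuming the default append_eos=True):

    vocab, x, src_lengths = self._get_test_data()
    # the longest sentence has 7 subwords plus EOS and there are 2 sentences,
    # so x is (max_len, batch) = (8, 2) after the transpose
    assert tuple(x.shape) == (8, 2)
    assert src_lengths.tolist() == [8, 5]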
Example #5
    def setup_task(cls, args, **kwargs):
        # Here we can perform any setup required for the task. This may include
        # loading Dictionaries, initializing shared Embedding layers, etc.
        # In this case we'll just load the Dictionaries.
        reloaded = torch.load(args.xlmr_model_dict)
        params = AttrDict(reloaded['params'])

        # build dictionary / update parameters
        input_vocab = Dictionary(reloaded['dico_id2word'],
                                 reloaded['dico_word2id'],
                                 reloaded['dico_counts'])
        params.n_words = len(input_vocab)
        params.bos_index = input_vocab.index(BOS_WORD)
        params.eos_index = input_vocab.index(EOS_WORD)
        params.pad_index = input_vocab.index(PAD_WORD)
        params.unk_index = input_vocab.index(UNK_WORD)
        params.mask_index = input_vocab.index(MASK_WORD)

        label_vocab = Dictionary.load(os.path.join(args.data,
                                                   'dict.label.txt'))
        print('| [input] dictionary: {} types'.format(len(input_vocab)))
        print('| [label] dictionary: {} types'.format(len(label_vocab)))

        return SemparseSeq2SeqTask(args, input_vocab, label_vocab)
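A hedged sketch of how this method might be invoked, assuming it is declared as a @classmethod on SemparseSeq2SeqTask as its cls parameter suggests; the paths are placeholders:

    import argparse

    # hypothetical paths; args only needs the attributes setup_task reads
    args = argparse.Namespace(xlmr_model_dict="/path/to/model_dict.pth",
                              data="/path/to/data-bin")
    task = SemparseSeq2SeqTask.setup_task(args)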
Example #6
    def _convert_src_tokens_to_tensor(
        self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
    ):
        src_len = [len(x) for x in src_tokens]
        # If we have to append EOS, we include EOS in counting src length
        if append_eos:
            src_len = [length + 1 for length in src_len]

        x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
        for i in range(len(src_tokens)):
            for j in range(len(src_tokens[i])):
                x[i][j] = vocab.index(src_tokens[i][j])
            if append_eos:
                x[i][j + 1] = vocab.eos()

        x = x.transpose(1, 0)
        return x, torch.LongTensor(src_len)
Example #7
    def test_overwrite(self):
        # for example, Camembert overwrites <unk>, <s> and </s>
        dict_file = io.StringIO("<unk> 999 #fairseq:overwrite\n"
                                "<s> 999 #fairseq:overwrite\n"
                                "</s> 999 #fairseq:overwrite\n"
                                ", 999\n"
                                "▁de 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index("<pad>"), 1)
        self.assertEqual(d.index("foo"), 3)
        self.assertEqual(d.index("<unk>"), 4)
        self.assertEqual(d.index("<s>"), 5)
        self.assertEqual(d.index("</s>"), 6)
        self.assertEqual(d.index(","), 7)
        self.assertEqual(d.index("▁de"), 8)
Example #8
    def test_add_file_to_dict(self):
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            dict = Dictionary()
            Dictionary.add_file_to_dictionary(filename, dict,
                                              tokenizer.tokenize_line, 10)
            dict.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dict.get_count(dict.index(c))
                self.assertEqual(
                    counts[c], count,
                    f"{c} count is {count} but should be {counts[c]}")
Example #9
def _lang_id(dic: Dictionary, lang: str):
    """Return language ID index."""
    idx = dic.index(lang)
    assert idx != dic.unk_index, "cannot find language ID for lang {}".format(lang)
    return idx
Example #10
def _lang_token_index(dic: Dictionary, lang: str):
    """Return language token index."""
    idx = dic.index(_lang_token(lang))
    assert idx != dic.unk_index, \
        'cannot find language token for lang {}'.format(lang)
    return idx
Example #11
    def get_lang_tag_idx(cls, lang: str, dictionary: Dictionary):
        lang_tag_idx = dictionary.index(cls.LANG_TAG_TEMPLATE.format(lang))
        assert lang_tag_idx != dictionary.unk()
        return lang_tag_idx
Example #12
def _lang_token_index(dic: Dictionary, lang: str, style="__{}__"):
    """Return language token index."""
    idx = dic.index(_lang_token(lang, style))
    assert idx != dic.unk_index, "cannot find language token for lang {}".format(lang)
    return idx
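A hedged usage sketch, assuming _lang_token(lang, style) simply returns style.format(lang) and that the dictionary was built with the corresponding token; the language code is illustrative:

    # hypothetical lookup: resolves the id of "__de__" in dic, asserting that
    # it is present rather than mapping to <unk>
    de_idx = _lang_token_index(dic, "de", style="__{}__")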
Example #13
class XLMRobertaTokenizer(PreTrainedTokenizer):
    """Custom tokenizer for our custom pretrained model. 
    You can ignore this file if you use another pretrained model. For example, if you use PhoBert, you should tokenize by using VnCoreNLP.
    """
    def __init__(self,
                 pretrained_file,
                 bos_token="<s>",
                 eos_token="</s>",
                 sep_token="</s>",
                 cls_token="<s>",
                 unk_token="<unk>",
                 pad_token="<pad>",
                 mask_token="<mask>",
                 **kwargs):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )

        # load bpe model and vocab file
        sentencepiece_model = pjoin(pretrained_file, 'sentencepiece.bpe.model')
        vocab_file = pjoin(pretrained_file, 'dict.txt')
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sentencepiece_model)
        # note: sp_model is only used to split text into pieces; taking ids or
        # anything else from it would break the token-to-id mapping below

        self.bpe_dict = Dictionary().load(vocab_file)

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {
            "<s>": 0,
            "<pad>": 1,
            "</s>": 2,
            "<unk>": 3
        }

        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 0

        self.fairseq_tokens_to_ids["<mask>"] = len(
            self.bpe_dict) + self.fairseq_offset
        self.fairseq_ids_to_tokens = {
            v: k
            for k, v in self.fairseq_tokens_to_ids.items()
        }

    def _tokenize(self, text):
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.bpe_dict.index(token)
        return spm_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.bpe_dict[index]

    @property
    def vocab_size(self):
        # + 1 accounts for the added <mask> token
        return len(self.bpe_dict) + self.fairseq_offset + 1

    def get_vocab(self):
        vocab = {
            self.convert_ids_to_tokens(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab
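A hedged usage sketch; the directory path is a placeholder and must contain the sentencepiece.bpe.model and dict.txt files loaded in __init__:

    # hypothetical directory holding sentencepiece.bpe.model and dict.txt
    tok = XLMRobertaTokenizer(pretrained_file="/path/to/pretrained")
    pieces = tok.tokenize("hello world")      # sentencepiece pieces
    ids = tok.convert_tokens_to_ids(pieces)   # ids via the fairseq/bpe dict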