def tokens_to_indices(self, tokens: List[str],
                      vocab: Vocabulary) -> Dict[str, List[int]]:
    """
    Map every token to a vocabulary index, once per related vocabulary.

    Each token is passed through ``self.transform`` and the result is
    looked up with ``vocab.get_token_index`` under every name listed in
    ``self.related_vocabs``.

    Parameters
    ----------
    tokens : ``List[str]``
        The tokens to convert.
    vocab : ``Vocabulary``
        ``vocab`` is used to get the index of each item.

    Returns
    -------
    res : ``Dict[str, List[int]]``
        One entry per name in ``self.related_vocabs``, holding one index
        per token in input order. If the token and index list is
        [w1:5, w2:3, w3:0], the result will be
        {'vocab_name' : [5, 3, 0]}
    """

    def indices_for(namespace):
        # The transform is applied inside the lookup, so it runs again
        # for every vocabulary rather than once per token overall.
        return [
            vocab.get_token_index(self.transform(item), namespace)
            for item in tokens
        ]

    return {
        namespace: indices_for(namespace)
        for namespace in self.related_vocabs
    }
def test_extend_from_pretrained_vocab(self):
    """
    Exercise ``Vocabulary.extend_from_pretrained_vocab`` in four setups:
    a plain pretrained list, a list intersected with another namespace,
    a namespace without an unknown entry, and a namespace without both
    unknown and padding entries.
    """
    vocab = Vocabulary()

    def check_indices(namespace, expected):
        # ``expected`` is a sequence of (token, index) pairs.
        for token, index in expected:
            assert vocab.get_token_index(token, namespace) == index

    def check_lookup_fails(lookup, key, namespace):
        # The lookup has to raise RuntimeError itself, not a subclass.
        with pytest.raises(RuntimeError) as caught:
            lookup(key, namespace)
        assert caught.type == RuntimeError

    # Plain pretrained vocabulary: known tokens are expected from index 2
    # onwards (presumably after reserved entries), and the unseen token
    # 'd' is expected to map to 0.
    vocab.extend_from_pretrained_vocab({'glove': ['a', 'b', 'c']})
    check_indices('glove', [('a', 2), ('c', 4), ('d', 0)])

    # Pretrained vocabulary intersected with 'glove': 'd' is not in
    # 'glove', so it is expected to map to 0 and index 4 must not exist.
    vocab.extend_from_pretrained_vocab({'w2v': ['b', 'c', 'd']},
                                       {'w2v': 'glove'})
    check_indices('w2v', [('b', 2), ('d', 0)])
    assert vocab.get_token_from_index(2, 'w2v') == 'b'
    check_lookup_fails(vocab.get_token_from_index, 4, 'w2v')

    # Namespace without an unknown entry: indices are expected to shift
    # down by one, and looking up an unseen token must raise.
    vocab.extend_from_pretrained_vocab(
        {'glove_nounk': ['a', 'b', 'c']},
        no_unk_namespace={'glove_nounk'})
    check_indices('glove_nounk', [('a', 1), ('c', 3)])
    check_lookup_fails(vocab.get_token_index, 'd', 'glove_nounk')

    # Namespace without unknown and padding entries: indices are expected
    # to start at 0, and looking up an unseen token must raise.
    vocab.extend_from_pretrained_vocab(
        {'glove_nounk_nopad': ['a', 'b', 'c']},
        no_unk_namespace={'glove_nounk_nopad'},
        no_pad_namespace={"glove_nounk_nopad"})
    check_indices('glove_nounk_nopad', [('a', 0), ('c', 2)])
    check_lookup_fails(vocab.get_token_index, 'd', 'glove_nounk_nopad')
def test_extend_from_counter(self):
    """
    Exercise ``Vocabulary.extend_from_counter`` in four setups: a plain
    counter, a counter filtered by a minimum count, a namespace without
    an unknown entry, and a namespace without both unknown and padding
    entries.
    """
    vocab = Vocabulary()

    def check_indices(namespace, expected):
        # ``expected`` is a sequence of (token, index) pairs.
        for token, index in expected:
            assert vocab.get_token_index(token, namespace) == index

    def check_unseen_token_fails(namespace):
        # The lookup has to raise RuntimeError itself, not a subclass.
        with pytest.raises(RuntimeError) as caught:
            vocab.get_token_index('That', namespace)
        assert caught.type == RuntimeError

    # Plain counter: the unseen token 'That' is expected to map to 0.
    words = ["This", "is", "a", "test", "sentence", '.']
    vocab.extend_from_counter({'w': Counter(words)})
    check_indices('w', [('a', 4), ('.', 7), ('That', 0)])

    # Counter with a minimum count of 2: 'This' occurs only once, so it
    # is expected to map to 0 just like a token that was never counted.
    vocab.extend_from_counter({'w_m': Counter(['This', 'is', 'is'])},
                              {'w_m': 2})
    check_indices('w_m', [('is', 2), ('This', 0), ('That', 0)])

    # Namespace without an unknown entry: unseen tokens must raise.
    vocab.extend_from_counter({'w_nounk': Counter(['This', 'is'])},
                              no_unk_namespace={'w_nounk'})
    check_unseen_token_fails('w_nounk')
    check_indices('w_nounk', [('This', 1)])

    # Namespace without unknown and padding entries: unseen tokens must
    # raise and the first counted token is expected at index 0.
    vocab.extend_from_counter(
        {'w_nounk_nopad': Counter(['This', 'is', 'a'])},
        no_unk_namespace={'w_nounk_nopad'},
        no_pad_namespace={'w_nounk_nopad'})
    check_unseen_token_fails('w_nounk_nopad')
    check_indices('w_nounk_nopad', [('This', 0)])
def tokens_to_indices(self, tokens: List[str],
                      vocab: Vocabulary) -> Dict[str, List[List[int]]]:
    """
    Map every element of every token to a vocabulary index, once per
    related vocabulary.

    Each token is iterated element by element (character by character
    for ``str`` tokens). Every element is passed through
    ``self.transform`` and the result is looked up with
    ``vocab.get_token_index`` under every name in ``self.related_vocabs``.

    Parameters
    ----------
    tokens : ``List[str]``
        The tokens to convert.
    vocab : ``Vocabulary``
        ``vocab`` is used to get the index of each item.

    Returns
    -------
    res : ``Dict[str, List[List[int]]]``
        One entry per name in ``self.related_vocabs``. Each entry holds
        one inner list per token, in input order, with one index per
        element of that token.
    """
    return {
        namespace: [
            [
                vocab.get_token_index(self.transform(char), namespace)
                for char in word
            ]
            for word in tokens
        ]
        for namespace in self.related_vocabs
    }
def test_vocabulary(self):
    """
    Build a ``Vocabulary`` in a single constructor call from counters
    and pretrained lists, then check token/index lookups in each of the
    eight namespaces.
    """
    letters = ['a', 'b', 'c']
    vocab = Vocabulary(
        counters={
            'w': Counter(["This", "is", "a", "test", "sentence", '.']),
            'w_m': Counter(['This', 'is', 'is']),
            'w_nounk': Counter(['This', 'is']),
            'w_nounk_nopad': Counter(['This', 'is', 'a']),
        },
        min_count={'w_m': 2},
        pretrained_vocab={
            'glove': list(letters),
            'w2v': ['b', 'c', 'd'],
            'glove_nounk': list(letters),
            'glove_nounk_nopad': list(letters),
        },
        intersection_vocab={'w2v': 'glove'},
        no_pad_namespace={'glove_nounk_nopad', 'w_nounk_nopad'},
        no_unk_namespace={
            'glove_nounk', 'w_nounk', 'glove_nounk_nopad', 'w_nounk_nopad'
        })

    def check_indices(namespace, expected):
        # ``expected`` is a sequence of (token, index) pairs.
        for token, index in expected:
            assert vocab.get_token_index(token, namespace) == index

    def check_lookup_fails(lookup, key, namespace):
        # The lookup has to raise RuntimeError itself, not a subclass.
        with pytest.raises(RuntimeError) as caught:
            lookup(key, namespace)
        assert caught.type == RuntimeError

    # glove: plain pretrained namespace; unseen 'd' is expected at 0.
    print(vocab.get_vocab_size('glove'))
    check_indices('glove', [('a', 2), ('c', 4), ('d', 0)])

    # w2v: intersected with glove, so 'd' is expected to be missing and
    # index 4 must not exist.
    check_indices('w2v', [('b', 2), ('d', 0)])
    assert vocab.get_token_from_index(2, 'w2v') == 'b'
    check_lookup_fails(vocab.get_token_from_index, 4, 'w2v')

    # glove_nounk: no unknown entry, so an unseen token must raise.
    check_indices('glove_nounk', [('a', 1), ('c', 3)])
    check_lookup_fails(vocab.get_token_index, 'd', 'glove_nounk')

    # glove_nounk_nopad: no unknown or padding entries.
    check_indices('glove_nounk_nopad', [('a', 0), ('c', 2)])
    check_lookup_fails(vocab.get_token_index, 'd', 'glove_nounk_nopad')

    # w: plain counter namespace; unseen 'That' is expected at 0.
    check_indices('w', [('a', 4), ('.', 7), ('That', 0)])

    # w_m: minimum count of 2, so the single 'This' is expected at 0.
    check_indices('w_m', [('is', 2), ('This', 0), ('That', 0)])

    # w_nounk: no unknown entry, so an unseen token must raise.
    check_lookup_fails(vocab.get_token_index, 'That', 'w_nounk')
    check_indices('w_nounk', [('This', 1)])

    # w_nounk_nopad: no unknown or padding entries.
    check_lookup_fails(vocab.get_token_index, 'That', 'w_nounk_nopad')
    check_indices('w_nounk_nopad', [('This', 0)])