def main():
    """Entry point: build the dataset pipeline for training/evaluation."""
    # Configuration file processing ...
    # DyNet setting ...

    # Build the dataset of the training process.
    # Reader for PTB-style dependency files (word/tag/head/rel columns).
    reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rpos**\t_\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',
    )

    # Vocabulary seeded with the pretrained GloVe word list.
    vocab = Vocabulary()
    glove_words, _ = glove_reader(cfg.GLOVE)
    vocab.extend_from_pretrained_vocab({'glove': glove_words})

    # One DatasetSetting per split.
    split_settings = {
        'train': DatasetSetting(cfg.TRAIN, True),
        'dev': DatasetSetting(cfg.DEV, True),
        'test': DatasetSetting(cfg.TEST, True),
    }
    datasets = SingleTaskDataset(vocab, split_settings, reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    # The 'rel' namespace gets neither a padding nor an unknown token.
    datasets.build_dataset(
        counters, no_pad_namespace={'rel'}, no_unk_namespace={'rel'})

    # Build model ...

    # Train model: fetch batches for each split; the final flag is True
    # only for the training batches.
    train_batch = datasets.get_batches(
        'train', cfg.TRAIN_BATCH_SIZE, True, cmp, True)
    valid_batch = datasets.get_batches(
        'dev', cfg.TEST_BATCH_SIZE, True, cmp, False)
    test_batch = datasets.get_batches(
        'test', cfg.TEST_BATCH_SIZE, True, cmp, False)
def tokens_to_indices(self, tokens: List[str],
                      vocab: Vocabulary) -> Dict[str, List[int]]:
    """Convert the tokens into one index list per related vocabulary.

    During the indexing process, each item corresponds to an index in
    the vocabulary.

    Parameters
    ----------
    vocab : ``Vocabulary``
        ``vocab`` is used to get the index of each item.

    Returns
    -------
    res : ``Dict[str, List[int]]``
        If the token and index list is [w1:5, w2:3, w3:0], the result
        will be {'vocab_name' : [5, 3, 0]}.
    """
    # One entry per related namespace; tokens are normalized through
    # self.transform before lookup.
    return {
        name: [vocab.get_token_index(self.transform(tok), name)
               for tok in tokens]
        for name in self.related_vocabs
    }
def test_instance(self):
    """An Instance counts vocab items and indexes all of its fields."""
    words = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counters = {'my_word': Counter(), 'my_char': Counter()}
    vocab = Vocabulary()
    vocab.extend_from_pretrained_vocab(
        {'glove': ['This', 'is', 'glove', 'sentence', 'vocabulary']})
    field = TextField('sentence', words,
                      [SingleIdTokenIndexer(['my_word', 'glove']),
                       CharTokenIndexer(['my_char'])])
    instance = Instance([field])

    # count_vocab_items() must tally whole words and single characters.
    instance.count_vocab_items(counters)
    word_counts = counters['my_word']
    assert word_counts['This'] == 1
    assert word_counts['is'] == 2
    assert word_counts['That'] == 0
    char_counts = counters['my_char']
    assert char_counts['s'] == 5
    assert char_counts['T'] == 1
    assert char_counts['t'] == 3
    assert char_counts['A'] == 0
    vocab.extend_from_counter(counters)

    # index_fields() must produce indices for every namespace.
    result = instance.index_fields(vocab)
    indexed = result['sentence']
    assert indexed['glove'] == [2, 3, 3, 0, 0, 0, 5]
    assert indexed['my_word'] == [2, 3, 3, 4, 4, 5, 6]
    assert indexed['my_char'][0] == [2, 3, 4, 5]  # 'This'
    assert indexed['my_char'][1] == indexed['my_char'][2]
    assert indexed['my_char'][3] == indexed['my_char'][4]
def __init__(self, model, cfg, vocabulary: Vocabulary):
    """Create word/tag/GloVe lookup parameters inside a subcollection.

    Parameters
    ----------
    model : parameter collection (DyNet-style); a subcollection is
        created to hold this module's parameters.
    cfg : configuration object providing WORD_DIM, TAG_DIM and GLOVE.
    vocabulary : ``Vocabulary``
        Supplies the sizes of the 'word' and 'tag' namespaces.
    """
    pc = model.add_subcollection()

    # Trainable word embeddings, initialized to all zeros.
    word_num = vocabulary.get_vocab_size('word')
    self.wlookup = pc.lookup_parameters_from_numpy(
        np.zeros((word_num, cfg.WORD_DIM), dtype=np.float32))

    # POS-tag embeddings, initialized from a standard normal.
    tag_num = vocabulary.get_vocab_size('tag')
    self.tlookup = pc.lookup_parameters_from_numpy(
        np.random.randn(tag_num, cfg.TAG_DIM).astype(np.float32))

    # Pretrained GloVe table. Two all-zero rows are prepended —
    # presumably covering the unk (0) and pad (1) indices of the
    # 'glove' namespace; TODO confirm against Vocabulary's layout.
    # The whole table is rescaled by its standard deviation.
    # NOTE: the original code also fetched get_vocab_size('glove')
    # into an unused local; that dead assignment has been removed.
    _, glove_vec = glove_reader(cfg.GLOVE)
    glove_dim = len(glove_vec[0])
    unk_pad_vec = [[0.0] * glove_dim]
    glove_vec = unk_pad_vec + unk_pad_vec + glove_vec
    glove_vec = np.array(glove_vec, dtype=np.float32) / np.std(glove_vec)
    self.glookup = pc.lookup_parameters_from_numpy(
        glove_vec.astype(np.float32))

    # Dimension of a token representation fed downstream (word + tag).
    self.token_dim = cfg.WORD_DIM + cfg.TAG_DIM
    self.vocabulary = vocabulary
    self.pc, self.cfg = pc, cfg
    # Spec tuple used to re-create this object on model load.
    self.spec = (cfg, vocabulary)
def tokens_to_indices(self, tokens: List[str],
                      vocab: Vocabulary) -> Dict[str, List[List[int]]]:
    """Convert the tokens into per-character index lists.

    During the indexing process, each token item corresponds to a list
    of index in the vocabulary.

    Parameters
    ----------
    vocab : ``Vocabulary``
        ``vocab`` is used to get the index of each item.
    """
    result = {}
    # For every related namespace, each token becomes the list of its
    # characters' indices (characters pass through self.transform).
    for name in self.related_vocabs:
        result[name] = [
            [vocab.get_token_index(self.transform(ch), name)
             for ch in token]
            for token in tokens
        ]
    return result
def test_single_id_token_indexer(self):
    """SingleIdTokenIndexer counts tokens and indexes every namespace."""
    words = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counters = {'my_word': Counter()}
    vocab = Vocabulary()
    vocab.extend_from_pretrained_vocab(
        {'glove': ['This', 'is', 'glove', 'sentence', 'vocabulary']})
    field = TextField('sentence', words,
                      [SingleIdTokenIndexer(['my_word', 'glove'])])

    # count_vocab_items() tallies whole tokens.
    field.count_vocab_items(counters)
    assert counters['my_word']['This'] == 1
    assert counters['my_word']['is'] == 2
    assert counters['my_word']['That'] == 0
    vocab.extend_from_counter(counters)

    # index() fills one index list per related namespace.
    field.index(vocab)
    assert field.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
    assert field.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
def test_char_token_indexer(self):
    """CharTokenIndexer counts characters and indexes per character."""
    words = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counters = {'my_char': Counter()}
    vocab = Vocabulary()
    vocab.extend_from_pretrained_vocab({'glove': ['a', 'b', 'c', 'd', 'e']})
    field = TextField('sentence', words,
                      [CharTokenIndexer(['my_char', 'glove'])])

    # count_vocab_items() tallies individual characters.
    field.count_vocab_items(counters)
    assert counters['my_char']['s'] == 5
    assert counters['my_char']['T'] == 1
    assert counters['my_char']['t'] == 3
    assert counters['my_char']['A'] == 0
    vocab.extend_from_counter(counters)

    # index() yields one list of character indices per token.
    field.index(vocab)
    assert field.indexes['glove'][0] == [0, 0, 0, 0]  # 'This'
    assert field.indexes['glove'][3] == [2]  # 'a'
    assert field.indexes['my_char'][0] == [2, 3, 4, 5]  # 'This'
def test_vocabulary(self):
    """Build a Vocabulary from counters and pretrained vocabs in one shot
    and verify indexing across every namespace configuration.

    Fix: removed a leftover debug ``print`` of the glove vocab size that
    polluted the test output.
    """
    pretrained_vocabs = {
        'glove': ['a', 'b', 'c'],
        'w2v': ['b', 'c', 'd'],
        'glove_nounk': ['a', 'b', 'c'],
        'glove_nounk_nopad': ['a', 'b', 'c']
    }
    counters = {
        'w': Counter(["This", "is", "a", "test", "sentence", '.']),
        'w_m': Counter(['This', 'is', 'is']),
        'w_nounk': Counter(['This', 'is']),
        'w_nounk_nopad': Counter(['This', 'is', 'a'])
    }
    vocab = Vocabulary(
        counters=counters,
        min_count={'w_m': 2},
        pretrained_vocab=pretrained_vocabs,
        intersection_vocab={'w2v': 'glove'},
        no_pad_namespace={'glove_nounk_nopad', 'w_nounk_nopad'},
        no_unk_namespace={
            'glove_nounk', 'w_nounk', 'glove_nounk_nopad', 'w_nounk_nopad'
        })

    # Test glove: default namespace, tokens start at index 2; misses
    # fall back to index 0.
    assert vocab.get_token_index('a', 'glove') == 2
    assert vocab.get_token_index('c', 'glove') == 4
    assert vocab.get_token_index('d', 'glove') == 0

    # Test w2v: intersected with glove, so 'd' is dropped.
    assert vocab.get_token_index('b', 'w2v') == 2
    assert vocab.get_token_index('d', 'w2v') == 0
    assert vocab.get_token_from_index(2, 'w2v') == 'b'
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_from_index(4, 'w2v')
    assert excinfo.type == RuntimeError

    # Test glove_nounk: no unknown token, unknown lookups raise.
    assert vocab.get_token_index('a', 'glove_nounk') == 1
    assert vocab.get_token_index('c', 'glove_nounk') == 3
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk')
    assert excinfo.type == RuntimeError

    # Test glove_nounk_nopad: no unk and no pad, tokens start at 0.
    assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
    assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk_nopad')
    assert excinfo.type == RuntimeError

    # Test w: plain counter-built namespace.
    assert vocab.get_token_index('a', 'w') == 4
    assert vocab.get_token_index('.', 'w') == 7
    assert vocab.get_token_index('That', 'w') == 0

    # Test w_m: min_count=2 drops 'This'.
    assert vocab.get_token_index('is', 'w_m') == 2
    assert vocab.get_token_index('This', 'w_m') == 0
    assert vocab.get_token_index('That', 'w_m') == 0

    # Test w_nounk: unknown lookups raise instead of returning 0.
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk') == 1

    # Test w_nounk_nopad: tokens start at 0.
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk_nopad')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk_nopad') == 0
def test_extend_from_pretrained_vocab(self):
    """Exercise extend_from_pretrained_vocab under each option combo."""
    vocab = Vocabulary()

    # Simple pretrained vocabulary: tokens start at index 2, and
    # unknown lookups fall back to index 0.
    vocab.extend_from_pretrained_vocab({'glove': ['a', 'b', 'c']})
    assert vocab.get_token_index('a', 'glove') == 2
    assert vocab.get_token_index('c', 'glove') == 4
    assert vocab.get_token_index('d', 'glove') == 0

    # Pretrained vocabulary intersected with an existing namespace:
    # tokens absent from 'glove' are dropped.
    vocab.extend_from_pretrained_vocab({'w2v': ['b', 'c', 'd']},
                                       {'w2v': 'glove'})
    assert vocab.get_token_index('b', 'w2v') == 2
    assert vocab.get_token_index('d', 'w2v') == 0
    assert vocab.get_token_from_index(2, 'w2v') == 'b'
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_from_index(4, 'w2v')
    assert excinfo.type == RuntimeError

    # Namespace without an unknown token: misses raise.
    vocab.extend_from_pretrained_vocab({'glove_nounk': ['a', 'b', 'c']},
                                       no_unk_namespace={
                                           'glove_nounk',
                                       })
    assert vocab.get_token_index('a', 'glove_nounk') == 1
    assert vocab.get_token_index('c', 'glove_nounk') == 3
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk')
    assert excinfo.type == RuntimeError

    # Namespace without unknown or padding tokens: tokens start at 0.
    vocab.extend_from_pretrained_vocab(
        {'glove_nounk_nopad': ['a', 'b', 'c']},
        no_unk_namespace={
            'glove_nounk_nopad',
        },
        no_pad_namespace={"glove_nounk_nopad"})
    assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
    assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk_nopad')
    assert excinfo.type == RuntimeError
def test_extend_from_counter(self):
    """Exercise extend_from_counter under each option combination."""
    vocab = Vocabulary()

    # Simple counter: tokens are indexed after the pad/unk slots.
    vocab.extend_from_counter(
        {'w': Counter(["This", "is", "a", "test", "sentence", '.'])})
    assert vocab.get_token_index('a', 'w') == 4
    assert vocab.get_token_index('.', 'w') == 7
    assert vocab.get_token_index('That', 'w') == 0

    # Counter filtered by a minimum frequency: 'This' (count 1) is
    # dropped, so it maps to the unknown index.
    vocab.extend_from_counter({'w_m': Counter(['This', 'is', 'is'])},
                              {'w_m': 2})
    assert vocab.get_token_index('is', 'w_m') == 2
    assert vocab.get_token_index('This', 'w_m') == 0
    assert vocab.get_token_index('That', 'w_m') == 0

    # Counter without an unknown token: misses raise instead of 0.
    vocab.extend_from_counter({'w_nounk': Counter(['This', 'is'])},
                              no_unk_namespace={
                                  'w_nounk',
                              })
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk') == 1

    # Counter without padding or unknown tokens: tokens start at 0.
    vocab.extend_from_counter(
        {'w_nounk_nopad': Counter(['This', 'is', 'a'])},
        no_unk_namespace={'w_nounk_nopad'},
        no_pad_namespace={'w_nounk_nopad'})
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk_nopad')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk_nopad') == 0