def __init__(self, model: dy.ParameterCollection, cfg: IniConfigurator,
             vocabulary: Vocabulary):
    pc = model.add_subcollection()
    # MLP layer
    orth_init = OrthogonalInitializer
    self.head_arc_MLP = MLP(pc, cfg.ARC_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                            cfg.MLP_BIAS, orth_init)
    self.head_rel_MLP = MLP(pc, cfg.REL_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                            cfg.MLP_BIAS, orth_init)
    self.dept_arc_MLP = MLP(pc, cfg.ARC_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                            cfg.MLP_BIAS, orth_init)
    self.dept_rel_MLP = MLP(pc, cfg.REL_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                            cfg.MLP_BIAS, orth_init)
    # Biaffine Attention Layer (Arc)
    arc_size = cfg.ARC_MLP_SIZE[-1]
    zero_init = dy.ConstInitializer(0)
    self.arc_attn_mat = [
        BiaffineMatAttention(pc, arc_size, arc_size, 1, True, False, zero_init)
        for _ in range(cfg.GRAPH_LAYERS + 1)]
    # Biaffine Attention Layer (Rel)
    rel_num = vocabulary.get_vocab_size('rel')
    rel_size = cfg.REL_MLP_SIZE[-1]
    self.rel_mask = np.array([1] + [0] * (rel_num - 1))  # mask root relation
    self.rel_attn = BiaffineMatAttention(pc, rel_size, rel_size, rel_num,
                                         True, True, zero_init)
    # Graph Network Layer
    self.head_gnn = GraphNNUnit(pc, arc_size, arc_size, leaky_relu, orth_init)
    self.dept_gnn = GraphNNUnit(pc, arc_size, arc_size, leaky_relu, orth_init)
    self.head_rel_gnn = GraphNNUnit(pc, rel_size, rel_size, leaky_relu, orth_init)
    self.dept_rel_gnn = GraphNNUnit(pc, rel_size, rel_size, leaky_relu, orth_init)
    # Graph Layer WarmUp
    self.warm_list = [-i * cfg.WARM for i in range(cfg.GRAPH_LAYERS, -1, -1)]
    # Save Variables
    self.arc_size, self.rel_size, self.rel_num = arc_size, rel_size, rel_num
    self.pc, self.cfg = pc, cfg
    self.spec = (cfg, vocabulary)
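# The decoder above projects each token into separate head/dependent arc and rel
# spaces with the four MLPs, then lets BiaffineMatAttention score every
# (head, dependent) pair. Below is a minimal numpy sketch of plain biaffine
# scoring, for intuition only: the function name, shapes, and bias handling are
# assumptions, not the actual BiaffineMatAttention implementation.
import numpy as np

def biaffine_scores(H_dep, H_head, U, bias_dep=True, bias_head=False):
    """Score all (dependent, head) pairs: rows index dependents, columns heads.

    H_dep, H_head: (n, d) token representations from the dependent/head MLPs.
    U: (d + bias_dep, d + bias_head) learned biaffine interaction matrix.
    """
    if bias_dep:   # extra row of U then acts as a bias depending only on the head
        H_dep = np.concatenate([H_dep, np.ones((H_dep.shape[0], 1))], axis=1)
    if bias_head:  # extra column of U then acts as a bias depending only on the dependent
        H_head = np.concatenate([H_head, np.ones((H_head.shape[0], 1))], axis=1)
    return H_dep @ U @ H_head.T  # (n, n) arc score matrix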
def test_single_id_token_indexer(self):
    sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counter = {'my_word': Counter()}
    vocab = Vocabulary()
    glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
    vocab.extend_from_pretrained_vocab({'glove': glove})
    indexer = SingleIdTokenIndexer(['my_word', 'glove'])
    sent = TextField('sentence', sentence, [indexer])

    # Test count_vocab_items()
    sent.count_vocab_items(counter)
    assert counter['my_word']['This'] == 1
    assert counter['my_word']['is'] == 2
    assert counter['my_word']['That'] == 0
    vocab.extend_from_counter(counter)

    # Test index()
    sent.index(vocab)
    assert sent.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
    assert sent.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
def test_char_token_indexer(self):
    sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
    counter = {'my_char': Counter()}
    vocab = Vocabulary()
    glove = ['a', 'b', 'c', 'd', 'e']
    vocab.extend_from_pretrained_vocab({'glove': glove})
    indexer = CharTokenIndexer(['my_char', 'glove'])
    sent = TextField('sentence', sentence, [indexer])

    # Test count_vocab_items()
    sent.count_vocab_items(counter)
    assert counter['my_char']['s'] == 5
    assert counter['my_char']['T'] == 1
    assert counter['my_char']['t'] == 3
    assert counter['my_char']['A'] == 0
    vocab.extend_from_counter(counter)

    # Test index()
    sent.index(vocab)
    assert sent.indexes['glove'][0] == [0, 0, 0, 0]    # 'This'
    assert sent.indexes['glove'][3] == [2]             # 'a'
    assert sent.indexes['my_char'][0] == [2, 3, 4, 5]  # 'This'
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training', action='store_true',
                           help='Load model Continue Training')
    argparser.add_argument('--name', default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model', default='s2s',
                           help='s2s: seq2seq-head-selection-model '
                                's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 to cpu)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy
    from models.token_representation import TokenRepresentation
    from antu.nn.dynet.seq2seq_encoders import DeepBiRNNBuilder, orthonormal_VanillaLSTMBuilder
    from models.graph_nn_decoder import GraphNNDecoder
    from models.jackknife_decoder import JackKnifeGraphNNDecoder

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {'train': DatasetSetting(cfg.TRAIN, True),
                         'dev': DatasetSetting(cfg.DEV, False),
                         'test': DatasetSetting(cfg.TEST, False), }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters, no_pad_namespace={'rel'},
                           no_unk_namespace={'rel'})

    # Build model
    # Parameter
    pc = dy.ParameterCollection()
    LR = 0.0005
    trainer = dy.AdamTrainer(pc, LR, cfg.ADAM_BETA1, cfg.ADAM_BETA2, cfg.EPS)

    # Token Representation Layer
    token_repre = TokenRepresentation(pc, cfg, datasets.vocabulary,
                                      include_pos=True)
    # BiLSTM Encoder Layer (alternative encoders from earlier experiments are
    # kept below, commented out)
    # encoder = BiaffineAttention()
    # encoder = MultiHeadedAttention(pc, 10, token_repre.token_dim)
    # encoder = MultiLayerMultiHeadAttention(pc, 10, token_repre.token_dim, num_layers=1)
    # encoder = MyMultiHeadAttention(None, 6, token_repre.token_dim, 32, 32, model=pc)
    # encoder = LabelAttention(None, token_repre.token_dim, 128, 128, 112, 128,
    #                          use_resdrop=True, q_as_matrix=False,
    #                          residual_dropout=0.1, attention_dropout=0.1,
    #                          d_positional=None, model=pc)
    # encoder = Encoder(None, token_repre.token_dim,
    #                   num_layers=1, num_heads=2, d_kv=32, d_ff=1024, d_l=112,
    #                   d_positional=None, num_layers_position_only=0,
    #                   relu_dropout=0.1, residual_dropout=0.1, attention_dropout=0.1,
    #                   use_lal=True, lal_d_kv=128, lal_d_proj=128, lal_resdrop=True,
    #                   lal_pwff=True, lal_q_as_matrix=False, lal_partitioned=True,
    #                   model=pc)
    # encoder = ScaledDotProductAttention(pc, 10)
    encoder = DeepBiRNNBuilder(pc, cfg.ENC_LAYERS, token_repre.token_dim,
                               cfg.ENC_H_DIM, orthonormal_VanillaLSTMBuilder)
    # GNN Decoder Layer
    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)
    # decoder = JackKnifeGraphNNDecoder(pc, cfg, datasets.vocabulary)
    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)
    # dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
    # exit(0)

    # Build Training Batch
    def cmp(ins): return len(ins['word'])
    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True,
                                       cmp, True)
    valid_batch = list(datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, False,
                                            cmp, False))
    test_batch = list(datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False,
                                           cmp, False))

    # Train model
    BEST_DEV_LAS = BEST_DEV_UAS = BEST_ITER = 0
    # Start from a negative iteration count so the decoder's graph layers can
    # warm up before the regular schedule begins.
    cnt_iter = -cfg.WARM * cfg.GRAPH_LAYERS
    valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]
    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)
    while cnt_iter < cfg.MAX_ITER:
        # dy.renew_cg()
        dy.renew_cg(immediate_compute=True, check_validity=True)
        cnt_iter += 1
        indexes, masks, truth = train_batch.__next__()
        vectors = token_repre(indexes, True)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False, True)
        loss, part_loss = decoder(vectors, masks, truth, cnt_iter, True, True)
        for i, l in enumerate([loss] + part_loss):
            valid_loss[i].append(l.value())
        loss.backward()
        trainer.learning_rate = LR * cfg.LR_DECAY**(max(cnt_iter, 0) / cfg.LR_ANNEAL)
        # trainer.learning_rate = cfg.LR * cfg.LR_DECAY**(max(cnt_iter, 0) / cfg.LR_ANNEAL)
        trainer.update()

        if cnt_iter % cfg.VALID_ITER:
            continue

        # Validation
        for i in range(len(valid_loss)):
            valid_loss[i] = str(round(np.mean(valid_loss[i]), 2))
        avg_loss = ', '.join(valid_loss)
        logger.info("")
        logger.info("Iter: %d-%d, Avg_loss: %s, LR (%f), Best (%d)" %
                    (cnt_iter / cfg.VALID_ITER, cnt_iter, avg_loss,
                     trainer.learning_rate, BEST_ITER))
        valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]

        my_eval.clear('Valid')
        for indexes, masks, truth in valid_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            # Evaluation calls the DeepBiRNNBuilder encoder with the same
            # signature as training, with dropout and training mode disabled.
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False, False)
            pred = decoder(vectors, masks, None, cnt_iter, False, True)
            my_eval.add_truth('Valid', truth)
            my_eval.add_pred('Valid', pred)
        dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
        if my_eval.evaluation('Valid', cfg.PRED_DEV, cfg.DEV):
            BEST_ITER = cnt_iter / cfg.VALID_ITER
            os.system('cp %s.data %s.data' % (cfg.LAST_FILE, cfg.BEST_FILE))
            os.system('cp %s.meta %s.meta' % (cfg.LAST_FILE, cfg.BEST_FILE))

        # Just record test result
        my_eval.clear('Test')
        for indexes, masks, truth in test_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False, False)
            pred = decoder(vectors, masks, None, cnt_iter, False, True)
            my_eval.add_truth('Test', truth)
            my_eval.add_pred('Test', pred)
        my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
    my_eval.print_best_result('Valid')

    # Final Test
    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)
    my_eval.clear('Test')
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False, False)
        pred = decoder(vectors, masks, None, 0, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
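# The training script above starts cnt_iter at -cfg.WARM * cfg.GRAPH_LAYERS, while
# the decoder stores warm_list = [-i * WARM for i in range(GRAPH_LAYERS, -1, -1)].
# A plausible reading, inferred from these two lines rather than from any documented
# behavior, is that each graph layer becomes active once cnt_iter passes its
# threshold; the helper below only illustrates that schedule and is not part of
# the actual decoder.
def active_graph_layers(cnt_iter, warm, graph_layers):
    """Count decoder graph layers whose warm-up threshold cnt_iter has reached."""
    warm_list = [-i * warm for i in range(graph_layers, -1, -1)]
    return sum(1 for threshold in warm_list if cnt_iter >= threshold)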
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training', action='store_true',
                           help='Load model Continue Training')
    argparser.add_argument('--name', default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model', default='s2s',
                           help='s2s: seq2seq-head-selection-model '
                                's2tBFS: seq2tree-BFS-decoder-model '
                                's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 to cpu)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy
    from models.token_representation import TokenRepresentation
    from antu.nn.dynet.seq2seq_encoders import DeepBiRNNBuilder, orthonormal_VanillaLSTMBuilder
    from models.graph_nn_decoder import GraphNNDecoder

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {'train': DatasetSetting(cfg.TRAIN, True),
                         'dev': DatasetSetting(cfg.DEV, False),
                         'test': DatasetSetting(cfg.TEST, False), }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters, no_pad_namespace={'rel'},
                           no_unk_namespace={'rel'})

    # Build model
    # Parameter
    pc = dy.ParameterCollection()
    trainer = dy.AdamTrainer(pc, alpha=cfg.LR, beta_1=cfg.ADAM_BETA1,
                             beta_2=cfg.ADAM_BETA2, eps=cfg.EPS)

    # Token Representation Layer
    token_repre = TokenRepresentation(pc, cfg, datasets.vocabulary)
    # BiLSTM Encoder Layer
    encoder = DeepBiRNNBuilder(pc, cfg.ENC_LAYERS, token_repre.token_dim,
                               cfg.ENC_H_DIM, orthonormal_VanillaLSTMBuilder)
    # GNN Decoder Layer
    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)
    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)

    # Build Training Batch
    def cmp(ins): return len(ins['word'])
    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True,
                                       cmp, True)
    valid_batch = list(
        datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, False, cmp, False))
    test_batch = list(
        datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp, False))

    # Train model
    BEST_DEV_LAS = BEST_DEV_UAS = BEST_ITER = cnt_iter = 0
    valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]
    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)
    while cnt_iter < cfg.MAX_ITER:
        dy.renew_cg()
        cnt_iter += 1
        indexes, masks, truth = train_batch.__next__()
        vectors = token_repre(indexes, True)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, True)
        loss, part_loss = decoder(vectors, masks, truth, True, True)
        for i, l in enumerate([loss] + part_loss):
            valid_loss[i].append(l.value())
        loss.backward()
        trainer.learning_rate = cfg.LR * cfg.LR_DECAY**(cnt_iter / cfg.LR_ANNEAL)
        trainer.update()

        if cnt_iter % cfg.VALID_ITER:
            continue

        # Validation
        for i in range(len(valid_loss)):
            valid_loss[i] = str(round(np.mean(valid_loss[i]), 2))
        avg_loss = ', '.join(valid_loss)
        logger.info("")
        logger.info("Iter: %d-%d, Avg_loss: %s, LR (%f), Best (%d)" %
                    (cnt_iter / cfg.VALID_ITER, cnt_iter, avg_loss,
                     trainer.learning_rate, BEST_ITER))
        valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]

        my_eval.clear('Valid')
        for indexes, masks, truth in valid_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False)
            pred = decoder(vectors, masks, None, False, True)
            my_eval.add_truth('Valid', truth)
            my_eval.add_pred('Valid', pred)
        dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
        if my_eval.evaluation('Valid', cfg.PRED_DEV, cfg.DEV):
            BEST_ITER = cnt_iter / cfg.VALID_ITER
            os.system('cp %s.data %s.data' % (cfg.LAST_FILE, cfg.BEST_FILE))
            os.system('cp %s.meta %s.meta' % (cfg.LAST_FILE, cfg.BEST_FILE))

        # Just record test result
        my_eval.clear('Test')
        for indexes, masks, truth in test_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False)
            pred = decoder(vectors, masks, None, False, True)
            my_eval.add_truth('Test', truth)
            my_eval.add_pred('Test', pred)
        my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
    my_eval.print_best_result('Valid')

    # Final Test
    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)
    my_eval.clear('Test')
    test_batch = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False,
                                      cmp, False)
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False)
        pred = decoder(vectors, masks, None, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
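# Both training scripts anneal the Adam step size exponentially:
# lr(t) = LR * LR_DECAY ** (t / LR_ANNEAL). The helper below is only a
# self-contained sketch for inspecting that schedule; the default values are
# illustrative placeholders, not the values from the project's config files.
def lr_schedule(step, base_lr=2e-3, decay=0.75, anneal=5000):
    """Learning rate after `step` training iterations under exponential decay."""
    return base_lr * decay ** (step / anneal)

# Example: lr_schedule(0) returns base_lr, and after every `anneal` further steps
# the rate has been multiplied by one more factor of `decay` (2e-3 -> 1.5e-3 -> ...).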
def test_vocabulary(self):
    pretrained_vocabs = {
        'glove': ['a', 'b', 'c'],
        'w2v': ['b', 'c', 'd'],
        'glove_nounk': ['a', 'b', 'c'],
        'glove_nounk_nopad': ['a', 'b', 'c']
    }
    counters = {
        'w': Counter(["This", "is", "a", "test", "sentence", '.']),
        'w_m': Counter(['This', 'is', 'is']),
        'w_nounk': Counter(['This', 'is']),
        'w_nounk_nopad': Counter(['This', 'is', 'a'])
    }
    vocab = Vocabulary(
        counters=counters,
        min_count={'w_m': 2},
        pretrained_vocab=pretrained_vocabs,
        intersection_vocab={'w2v': 'glove'},
        no_pad_namespace={'glove_nounk_nopad', 'w_nounk_nopad'},
        no_unk_namespace={'glove_nounk', 'w_nounk', 'glove_nounk_nopad',
                          'w_nounk_nopad'})

    # Test glove
    print(vocab.get_vocab_size('glove'))
    assert vocab.get_token_index('a', 'glove') == 2
    assert vocab.get_token_index('c', 'glove') == 4
    assert vocab.get_token_index('d', 'glove') == 0
    # Test w2v
    assert vocab.get_token_index('b', 'w2v') == 2
    assert vocab.get_token_index('d', 'w2v') == 0
    assert vocab.get_token_from_index(2, 'w2v') == 'b'
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_from_index(4, 'w2v')
    assert excinfo.type == RuntimeError
    # Test glove_nounk
    assert vocab.get_token_index('a', 'glove_nounk') == 1
    assert vocab.get_token_index('c', 'glove_nounk') == 3
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk')
    assert excinfo.type == RuntimeError
    # Test glove_nounk_nopad
    assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
    assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk_nopad')
    assert excinfo.type == RuntimeError
    # Test w
    assert vocab.get_token_index('a', 'w') == 4
    assert vocab.get_token_index('.', 'w') == 7
    assert vocab.get_token_index('That', 'w') == 0
    # Test w_m
    assert vocab.get_token_index('is', 'w_m') == 2
    assert vocab.get_token_index('This', 'w_m') == 0
    assert vocab.get_token_index('That', 'w_m') == 0
    # Test w_nounk
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk') == 1
    # Test w_nounk_nopad
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk_nopad')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk_nopad') == 0
def test_extend_from_pretrained_vocab(self):
    vocab = Vocabulary()

    # Test extending a vocabulary from a simple pretrained vocabulary
    pretrained_vocabs = {'glove': ['a', 'b', 'c']}
    vocab.extend_from_pretrained_vocab(pretrained_vocabs)
    assert vocab.get_token_index('a', 'glove') == 2
    assert vocab.get_token_index('c', 'glove') == 4
    assert vocab.get_token_index('d', 'glove') == 0

    # Test extending a vocabulary from a pretrained vocabulary,
    # intersected with another vocabulary.
    pretrained_vocabs = {'w2v': ['b', 'c', 'd']}
    vocab.extend_from_pretrained_vocab(pretrained_vocabs, {'w2v': 'glove'})
    assert vocab.get_token_index('b', 'w2v') == 2
    assert vocab.get_token_index('d', 'w2v') == 0
    assert vocab.get_token_from_index(2, 'w2v') == 'b'
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_from_index(4, 'w2v')
    assert excinfo.type == RuntimeError

    # Test extending a vocabulary from a pretrained vocabulary without an unk token
    pretrained_vocabs = {'glove_nounk': ['a', 'b', 'c']}
    vocab.extend_from_pretrained_vocab(pretrained_vocabs,
                                       no_unk_namespace={'glove_nounk', })
    assert vocab.get_token_index('a', 'glove_nounk') == 1
    assert vocab.get_token_index('c', 'glove_nounk') == 3
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk')
    assert excinfo.type == RuntimeError

    # Test extending a vocabulary from a pretrained vocabulary without unk and pad tokens
    pretrained_vocabs = {'glove_nounk_nopad': ['a', 'b', 'c']}
    vocab.extend_from_pretrained_vocab(
        pretrained_vocabs,
        no_unk_namespace={'glove_nounk_nopad', },
        no_pad_namespace={"glove_nounk_nopad"})
    assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
    assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('d', 'glove_nounk_nopad')
    assert excinfo.type == RuntimeError
def test_extend_from_counter(self):
    vocab = Vocabulary()

    # Test extending a vocabulary from a simple counter
    counter = {'w': Counter(["This", "is", "a", "test", "sentence", '.'])}
    vocab.extend_from_counter(counter)
    assert vocab.get_token_index('a', 'w') == 4
    assert vocab.get_token_index('.', 'w') == 7
    assert vocab.get_token_index('That', 'w') == 0

    # Test extending a vocabulary from a counter with min_count
    counter = {'w_m': Counter(['This', 'is', 'is'])}
    min_count = {'w_m': 2}
    vocab.extend_from_counter(counter, min_count)
    assert vocab.get_token_index('is', 'w_m') == 2
    assert vocab.get_token_index('This', 'w_m') == 0
    assert vocab.get_token_index('That', 'w_m') == 0

    # Test extending a vocabulary from a counter without an unk token
    counter = {'w_nounk': Counter(['This', 'is'])}
    vocab.extend_from_counter(counter, no_unk_namespace={'w_nounk', })
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk') == 1

    # Test extending a vocabulary from a counter without pad & unk tokens
    counter = {'w_nounk_nopad': Counter(['This', 'is', 'a'])}
    vocab.extend_from_counter(counter,
                              no_unk_namespace={'w_nounk_nopad'},
                              no_pad_namespace={'w_nounk_nopad'})
    with pytest.raises(RuntimeError) as excinfo:
        vocab.get_token_index('That', 'w_nounk_nopad')
    assert excinfo.type == RuntimeError
    assert vocab.get_token_index('This', 'w_nounk_nopad') == 0
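# Taken together, the assertions in these tests imply the following index layout
# (inferred from the tests themselves, not from library documentation): namespaces
# with both special tokens reserve indexes 0 and 1, with out-of-vocabulary lookups
# returning 0, so the first real entry sits at index 2; dropping the unk token
# shifts real entries to start at 1, and dropping both unk and pad shifts them to
# start at 0, with OOV lookups then raising RuntimeError. A small sketch exercising
# exactly that behavior (the namespace names here are illustrative):
from collections import Counter

def show_index_offsets():
    vocab = Vocabulary()
    vocab.extend_from_counter({'w': Counter(['hello'])})
    vocab.extend_from_counter({'w_nounk': Counter(['hello'])},
                              no_unk_namespace={'w_nounk'})
    vocab.extend_from_counter({'w_bare': Counter(['hello'])},
                              no_unk_namespace={'w_bare'},
                              no_pad_namespace={'w_bare'})
    print(vocab.get_token_index('hello', 'w'))        # expected 2
    print(vocab.get_token_index('hello', 'w_nounk'))  # expected 1
    print(vocab.get_token_index('hello', 'w_bare'))   # expected 0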
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training', action='store_true',
                           help='Load model Continue Training')
    argparser.add_argument('--name', default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model', default='s2s',
                           help='s2s: seq2seq-head-selection-model '
                                's2tBFS: seq2tree-BFS-decoder-model '
                                's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 to cpu)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {'train': DatasetSetting(cfg.TRAIN, True),
                         'dev': DatasetSetting(cfg.DEV, False),
                         'test': DatasetSetting(cfg.TEST, False), }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters, no_pad_namespace={'rel'},
                           no_unk_namespace={'rel'})

    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)

    # Build Test model
    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)

    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)
    my_eval.clear('Test')

    def cmp(ins): return len(ins['word'])
    test_batch = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False,
                                      cmp, False)
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)
        # Pass the batch mask to the encoder, matching the training-time
        # call signature.
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False)
        pred = decoder(vectors, masks, None, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)