Example #1
    def __init__(self, model: dy.ParameterCollection, cfg: IniConfigurator,
                 vocabulary: Vocabulary):
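        # Graph-based dependency decoder: four MLPs project encoder states
        # into head/dependent spaces for arcs and relations; stacked biaffine
        # attention layers score arcs across the GNN layers, and a final
        # biaffine layer scores relation labels.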

        pc = model.add_subcollection()
        # MLP layer
        orth_init = OrthogonalInitializer
        self.head_arc_MLP = MLP(pc, cfg.ARC_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                                cfg.MLP_BIAS, orth_init)
        self.head_rel_MLP = MLP(pc, cfg.REL_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                                cfg.MLP_BIAS, orth_init)
        self.dept_arc_MLP = MLP(pc, cfg.ARC_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                                cfg.MLP_BIAS, orth_init)
        self.dept_rel_MLP = MLP(pc, cfg.REL_MLP_SIZE, leaky_relu, cfg.MLP_DROP,
                                cfg.MLP_BIAS, orth_init)

        # Biaffine Attention Layer (Arc)
        arc_size = cfg.ARC_MLP_SIZE[-1]
        zero_init = dy.ConstInitializer(0)
        self.arc_attn_mat = [
            BiaffineMatAttention(pc, arc_size, arc_size, 1, True, False,
                                 zero_init)
            for _ in range(cfg.GRAPH_LAYERS + 1)
        ]

        # Biaffine Attention Layer (Rel)
        rel_num = vocabulary.get_vocab_size('rel')
        rel_size = cfg.REL_MLP_SIZE[-1]
        self.rel_mask = np.array([1] + [0] * (rel_num - 1))  # mask the root relation (index 0)
        self.rel_attn = BiaffineMatAttention(pc, rel_size, rel_size, rel_num,
                                             True, True, zero_init)

        # Graph Network Layer
        self.head_gnn = GraphNNUnit(pc, arc_size, arc_size, leaky_relu,
                                    orth_init)
        self.dept_gnn = GraphNNUnit(pc, arc_size, arc_size, leaky_relu,
                                    orth_init)
        self.head_rel_gnn = GraphNNUnit(pc, rel_size, rel_size, leaky_relu,
                                        orth_init)
        self.dept_rel_gnn = GraphNNUnit(pc, rel_size, rel_size, leaky_relu,
                                        orth_init)

        # Graph Layer WarmUp
        self.warm_list = [
            -i * cfg.WARM for i in range(cfg.GRAPH_LAYERS, -1, -1)
        ]
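        # warm_list = [-GRAPH_LAYERS*WARM, ..., -WARM, 0]; the training loop in
        # Example #4 starts its iteration counter at -WARM*GRAPH_LAYERS, so each
        # graph layer presumably only starts contributing once the counter
        # passes its offset.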

        # Save Variable
        self.arc_size, self.rel_size, self.rel_num = arc_size, rel_size, rel_num
        self.pc, self.cfg = pc, cfg
        self.spec = (cfg, vocabulary)
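
For context, the training script in Example #4 constructs and drives this decoder; a minimal usage sketch extracted from that example (argument meanings inferred from the call sites):

    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)
    # training: gold 'truth' given, returns the total loss plus per-layer partial losses
    loss, part_loss = decoder(vectors, masks, truth, cnt_iter, True, True)
    # inference: truth=None and the train flag off, returns predicted arcs/relations
    pred = decoder(vectors, masks, None, cnt_iter, False, True)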
Example #2
    def test_single_id_token_indexer(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_word': Counter()}
        vocab = Vocabulary()
        glove = ['This', 'is', 'glove', 'sentence', 'vocabulary']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        indexer = SingleIdTokenIndexer(['my_word', 'glove'])
        sent = TextField('sentence', sentence, [indexer])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        assert counter['my_word']['This'] == 1
        assert counter['my_word']['is'] == 2
        assert counter['my_word']['That'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
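        # In the fixed 'glove' namespace, tokens outside the pretrained list ('a', 'test') map to unk = 0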
        assert sent.indexes['glove'] == [2, 3, 3, 0, 0, 0, 5]
        assert sent.indexes['my_word'] == [2, 3, 3, 4, 4, 5, 6]
Example #3
    def test_char_token_indexer(self):
        sentence = ['This', 'is', 'is', 'a', 'a', 'test', 'sentence']
        counter = {'my_char': Counter()}
        vocab = Vocabulary()
        glove = ['a', 'b', 'c', 'd', 'e']
        vocab.extend_from_pretrained_vocab({'glove': glove})
        indexer = CharTokenIndexer(['my_char', 'glove'])
        sent = TextField('sentence', sentence, [indexer])

        # Test count_vocab_items()
        sent.count_vocab_items(counter)
        assert counter['my_char']['s'] == 5
        assert counter['my_char']['T'] == 1
        assert counter['my_char']['t'] == 3
        assert counter['my_char']['A'] == 0

        vocab.extend_from_counter(counter)

        # Test index()
        sent.index(vocab)
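        # Each token is indexed character by character; characters outside the glove list ('a'-'e') map to unk = 0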
        assert sent.indexes['glove'][0] == [0, 0, 0, 0]  # 'This'
        assert sent.indexes['glove'][3] == [2]  # 'a'
        assert sent.indexes['my_char'][0] == [2, 3, 4, 5]  # 'This'
Example #4
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training', action='store_true',
                           help='Load model and continue training')
    argparser.add_argument('--name', default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model', default='s2s',
                           help='s2s: seq2seq-head-selection-model; '
                           's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 for CPU)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy
    from models.token_representation import TokenRepresentation
    from antu.nn.dynet.seq2seq_encoders import DeepBiRNNBuilder, orthonormal_VanillaLSTMBuilder
    from models.graph_nn_decoder import GraphNNDecoder
    from models.jackknife_decoder import JackKnifeGraphNNDecoder

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
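    # (The 'root' string is presumably a synthetic CoNLL-style row prepended
    # for the artificial root token 0.)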
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {'train': DatasetSetting(cfg.TRAIN, True),
                         'dev': DatasetSetting(cfg.DEV, False),
                         'test': DatasetSetting(cfg.TEST, False), }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters, no_pad_namespace={'rel'}, no_unk_namespace={'rel'})

    # Build model
    # Parameter
    pc = dy.ParameterCollection()
    LR = 0.0005  # NOTE: hard-coded here; overrides cfg.LR (see the LR-decay update below)
    trainer = dy.AdamTrainer(pc, LR, cfg.ADAM_BETA1, cfg.ADAM_BETA2, cfg.EPS)

    # Token Representation Layer
    token_repre = TokenRepresentation(pc, cfg, datasets.vocabulary, include_pos=True)
    # BiLSTM Encoder Layer
    # (Alternative encoders tried here and left commented out in the original:
    #  BiaffineAttention, MultiHeadedAttention, MultiLayerMultiHeadAttention,
    #  MyMultiHeadAttention, LabelAttention, a transformer Encoder with label
    #  attention (LAL), and ScaledDotProductAttention.)
    encoder = DeepBiRNNBuilder(pc, cfg.ENC_LAYERS, token_repre.token_dim,
                               cfg.ENC_H_DIM, orthonormal_VanillaLSTMBuilder)
    # GNN Decoder Layer
    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)

    # (Alternative decoder, commented out in the original:
    #  JackKnifeGraphNNDecoder(pc, cfg, datasets.vocabulary))
    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)

    # Build Training Batch
    def cmp(ins):
        return len(ins['word'])  # key: sort instances by sentence length when batching
    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True, cmp, True)
    valid_batch = list(datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, False, cmp, False))
    test_batch = list(datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp, False))

    # Train model
    BEST_DEV_LAS = BEST_DEV_UAS = BEST_ITER = 0
    cnt_iter = -cfg.WARM * cfg.GRAPH_LAYERS
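    # Start the counter negative so the GNN layers warm up one after another
    # (cf. warm_list in GraphNNDecoder.__init__, Example #1).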
    valid_loss = [[] for i in range(cfg.GRAPH_LAYERS+3)]
    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)
    while cnt_iter < cfg.MAX_ITER:
        dy.renew_cg(immediate_compute=True, check_validity=True)  # debug flags; plain dy.renew_cg() is faster
        cnt_iter += 1
        indexes, masks, truth = next(train_batch)
        vectors = token_repre(indexes, True)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False, True)
        loss, part_loss = decoder(vectors, masks, truth, cnt_iter, True, True)
        for i, l in enumerate([loss]+part_loss):
            valid_loss[i].append(l.value())
        loss.backward()
        # exponential decay: lr = LR * LR_DECAY^(iter / LR_ANNEAL)
        trainer.learning_rate = LR * cfg.LR_DECAY**(max(cnt_iter, 0) / cfg.LR_ANNEAL)
        trainer.update()

        if cnt_iter % cfg.VALID_ITER: continue  # validate every VALID_ITER iterations
        # Validation
        for i in range(len(valid_loss)):
            valid_loss[i] = str(round(np.mean(valid_loss[i]), 2))
        avg_loss = ', '.join(valid_loss)
        logger.info("")
        logger.info("Iter: %d-%d, Avg_loss: %s, LR (%f), Best (%d)" %
                    (cnt_iter/cfg.VALID_ITER, cnt_iter, avg_loss,
                     trainer.learning_rate, BEST_ITER))

        valid_loss = [[] for i in range(cfg.GRAPH_LAYERS+3)]
        my_eval.clear('Valid')
        for indexes, masks, truth in valid_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)

            # use the same encoder signature as training, with the train flag off
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False, False)

            pred = decoder(vectors, masks, None, cnt_iter, False, True)
            my_eval.add_truth('Valid', truth)
            my_eval.add_pred('Valid', pred)
        dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
        if my_eval.evaluation('Valid', cfg.PRED_DEV, cfg.DEV):
            BEST_ITER = cnt_iter/cfg.VALID_ITER
            os.system('cp %s.data %s.data' % (cfg.LAST_FILE, cfg.BEST_FILE))
            os.system('cp %s.meta %s.meta' % (cfg.LAST_FILE, cfg.BEST_FILE))

        # Just record test result
        my_eval.clear('Test')
        for indexes, masks, truth in test_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)

            # use the same encoder signature as training, with the train flag off
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False, False)

            pred = decoder(vectors, masks, None, cnt_iter, False, True)
            my_eval.add_truth('Test', truth)
            my_eval.add_pred('Test', pred)
        my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
    my_eval.print_best_result('Valid')

    # Final Test
    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)
    my_eval.clear('Test')
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)

        # use the same encoder signature as training, with the train flag off
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False, False)

        pred = decoder(vectors, masks, None, 0, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
Example #5
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training',
                           action='store_true',
                           help='Load model and continue training')
    argparser.add_argument('--name',
                           default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model',
                           default='s2s',
                           help='s2s: seq2seq-head-selection-model; '
                           's2tBFS: seq2tree-BFS-decoder-model; '
                           's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 for CPU)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy
    from models.token_representation import TokenRepresentation
    from antu.nn.dynet.seq2seq_encoders import DeepBiRNNBuilder, orthonormal_VanillaLSTMBuilder
    from models.graph_nn_decoder import GraphNNDecoder

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',
    )
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {
        'train': DatasetSetting(cfg.TRAIN, True),
        'dev': DatasetSetting(cfg.DEV, False),
        'test': DatasetSetting(cfg.TEST, False),
    }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters,
                           no_pad_namespace={'rel'},
                           no_unk_namespace={'rel'})

    # Build model
    # Parameter
    pc = dy.ParameterCollection()
    trainer = dy.AdamTrainer(pc,
                             alpha=cfg.LR,
                             beta_1=cfg.ADAM_BETA1,
                             beta_2=cfg.ADAM_BETA2,
                             eps=cfg.EPS)

    # Token Representation Layer
    token_repre = TokenRepresentation(pc, cfg, datasets.vocabulary)
    # BiLSTM Encoder Layer
    encoder = DeepBiRNNBuilder(pc, cfg.ENC_LAYERS, token_repre.token_dim,
                               cfg.ENC_H_DIM, orthonormal_VanillaLSTMBuilder)
    # GNN Decoder Layer
    decoder = GraphNNDecoder(pc, cfg, datasets.vocabulary)
    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)

    # Build Training Batch
    def cmp(ins):
        return len(ins['word'])  # key: sort instances by sentence length when batching

    train_batch = datasets.get_batches('train', cfg.TRAIN_BATCH_SIZE, True,
                                       cmp, True)
    valid_batch = list(
        datasets.get_batches('dev', cfg.TEST_BATCH_SIZE, False, cmp, False))
    test_batch = list(
        datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp, False))

    # Train model
    BEST_DEV_LAS = BEST_DEV_UAS = BEST_ITER = cnt_iter = 0
    valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]
    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)
    while cnt_iter < cfg.MAX_ITER:
        dy.renew_cg()
        cnt_iter += 1
        indexes, masks, truth = next(train_batch)
        vectors = token_repre(indexes, True)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, True)
        loss, part_loss = decoder(vectors, masks, truth, True, True)
        for i, l in enumerate([loss] + part_loss):
            valid_loss[i].append(l.value())
        loss.backward()
        trainer.learning_rate = cfg.LR * cfg.LR_DECAY**(cnt_iter /
                                                        cfg.LR_ANNEAL)
        trainer.update()

        if cnt_iter % cfg.VALID_ITER:
            continue

        # Validation
        for i in range(len(valid_loss)):
            valid_loss[i] = str(round(np.mean(valid_loss[i]), 2))
        avg_loss = ', '.join(valid_loss)
        logger.info("")
        logger.info("Iter: %d-%d, Avg_loss: %s, LR (%f), Best (%d)" %
                    (cnt_iter / cfg.VALID_ITER, cnt_iter, avg_loss,
                     trainer.learning_rate, BEST_ITER))

        valid_loss = [[] for i in range(cfg.GRAPH_LAYERS + 3)]
        my_eval.clear('Valid')
        for indexes, masks, truth in valid_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False)
            pred = decoder(vectors, masks, None, False, True)
            my_eval.add_truth('Valid', truth)
            my_eval.add_pred('Valid', pred)
        dy.save(cfg.LAST_FILE, [token_repre, encoder, decoder])
        if my_eval.evaluation('Valid', cfg.PRED_DEV, cfg.DEV):
            BEST_ITER = cnt_iter / cfg.VALID_ITER
            os.system('cp %s.data %s.data' % (cfg.LAST_FILE, cfg.BEST_FILE))
            os.system('cp %s.meta %s.meta' % (cfg.LAST_FILE, cfg.BEST_FILE))

        # Just record test result
        my_eval.clear('Test')
        for indexes, masks, truth in test_batch:
            dy.renew_cg()
            vectors = token_repre(indexes, False)
            vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                              np.array(masks['1D']).T, False)
            pred = decoder(vectors, masks, None, False, True)
            my_eval.add_truth('Test', truth)
            my_eval.add_pred('Test', pred)
        my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
    my_eval.print_best_result('Valid')

    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)

    my_eval.clear('Test')
    test_batch = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp,
                                      False)
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False)
        pred = decoder(vectors, masks, None, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)
Example #6
    def test_vocabulary(self):
        pretrained_vocabs = {
            'glove': ['a', 'b', 'c'],
            'w2v': ['b', 'c', 'd'],
            'glove_nounk': ['a', 'b', 'c'],
            'glove_nounk_nopad': ['a', 'b', 'c']
        }

        counters = {
            'w': Counter(["This", "is", "a", "test", "sentence", '.']),
            'w_m': Counter(['This', 'is', 'is']),
            'w_nounk': Counter(['This', 'is']),
            'w_nounk_nopad': Counter(['This', 'is', 'a'])
        }

        vocab = Vocabulary(
            counters=counters,
            min_count={'w_m': 2},
            pretrained_vocab=pretrained_vocabs,
            intersection_vocab={'w2v': 'glove'},
            no_pad_namespace={'glove_nounk_nopad', 'w_nounk_nopad'},
            no_unk_namespace={
                'glove_nounk', 'w_nounk', 'glove_nounk_nopad', 'w_nounk_nopad'
            })

        # Test glove
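        # Index layout: with both unk (0) and pad (1) reserved, the first real
        # token gets index 2; no_unk namespaces start at 1; namespaces with
        # neither unk nor pad start at 0.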
        assert vocab.get_vocab_size('glove') == 5  # 'a', 'b', 'c' plus unk and pad
        assert vocab.get_token_index('a', 'glove') == 2
        assert vocab.get_token_index('c', 'glove') == 4
        assert vocab.get_token_index('d', 'glove') == 0

        # Test w2v
        assert vocab.get_token_index('b', 'w2v') == 2
        assert vocab.get_token_index('d', 'w2v') == 0
        assert vocab.get_token_from_index(2, 'w2v') == 'b'
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_from_index(4, 'w2v')
        assert excinfo.type == RuntimeError

        # Test glove_nounk
        assert vocab.get_token_index('a', 'glove_nounk') == 1
        assert vocab.get_token_index('c', 'glove_nounk') == 3
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('d', 'glove_nounk')
        assert excinfo.type == RuntimeError

        # Test glove_nounk_nopad
        assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
        assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('d', 'glove_nounk_nopad')
        assert excinfo.type == RuntimeError

        # Test w
        assert vocab.get_token_index('a', 'w') == 4
        assert vocab.get_token_index('.', 'w') == 7
        assert vocab.get_token_index('That', 'w') == 0

        # Test w_m
        assert vocab.get_token_index('is', 'w_m') == 2
        assert vocab.get_token_index('This', 'w_m') == 0
        assert vocab.get_token_index('That', 'w_m') == 0

        # Test w_nounk
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('That', 'w_nounk')
        assert excinfo.type == RuntimeError
        assert vocab.get_token_index('This', 'w_nounk') == 1

        # Test w_nounk_nopad
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('That', 'w_nounk_nopad')
        assert excinfo.type == RuntimeError
        assert vocab.get_token_index('This', 'w_nounk_nopad') == 0
Example #7
    def test_extend_from_pretrained_vocab(self):
        vocab = Vocabulary()

        # Test extending a vocabulary from a simple pretrained vocab
        pretrained_vocabs = {'glove': ['a', 'b', 'c']}
        vocab.extend_from_pretrained_vocab(pretrained_vocabs)
        assert vocab.get_token_index('a', 'glove') == 2
        assert vocab.get_token_index('c', 'glove') == 4
        assert vocab.get_token_index('d', 'glove') == 0

        # Test extending a vocabulary from a pretrained vocabulary
        # and intersecting it with another vocabulary.
        pretrained_vocabs = {'w2v': ['b', 'c', 'd']}
        vocab.extend_from_pretrained_vocab(pretrained_vocabs, {'w2v': 'glove'})
        assert vocab.get_token_index('b', 'w2v') == 2
        assert vocab.get_token_index('d', 'w2v') == 0
        assert vocab.get_token_from_index(2, 'w2v') == 'b'
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_from_index(4, 'w2v')
        assert excinfo.type == RuntimeError

        # Test extending a vocabulary from a pretrained vocabulary without an unknown (OOV) token
        pretrained_vocabs = {'glove_nounk': ['a', 'b', 'c']}
        vocab.extend_from_pretrained_vocab(pretrained_vocabs,
                                           no_unk_namespace={
                                               'glove_nounk',
                                           })
        assert vocab.get_token_index('a', 'glove_nounk') == 1
        assert vocab.get_token_index('c', 'glove_nounk') == 3
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('d', 'glove_nounk')
        assert excinfo.type == RuntimeError

        # Test extending a vocabulary from a pretrained vocabulary without unknown and padding tokens
        pretrained_vocabs = {'glove_nounk_nopad': ['a', 'b', 'c']}
        vocab.extend_from_pretrained_vocab(
            pretrained_vocabs,
            no_unk_namespace={
                'glove_nounk_nopad',
            },
            no_pad_namespace={"glove_nounk_nopad"})
        assert vocab.get_token_index('a', 'glove_nounk_nopad') == 0
        assert vocab.get_token_index('c', 'glove_nounk_nopad') == 2
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('d', 'glove_nounk_nopad')
        assert excinfo.type == RuntimeError
Example #8
    def test_extend_from_counter(self):
        vocab = Vocabulary()

        # Test extending a vocabulary from a simple counter
        counter = {'w': Counter(["This", "is", "a", "test", "sentence", '.'])}
        vocab.extend_from_counter(counter)
        assert vocab.get_token_index('a', 'w') == 4
        assert vocab.get_token_index('.', 'w') == 7
        assert vocab.get_token_index('That', 'w') == 0

        # Test extending a vocabulary from a counter with min_count
        counter = {'w_m': Counter(['This', 'is', 'is'])}
        min_count = {'w_m': 2}
        vocab.extend_from_counter(counter, min_count)
        assert vocab.get_token_index('is', 'w_m') == 2
        assert vocab.get_token_index('This', 'w_m') == 0
        assert vocab.get_token_index('That', 'w_m') == 0

        # Test extending a vocabulary from a counter without an OOV token
        counter = {'w_nounk': Counter(['This', 'is'])}
        vocab.extend_from_counter(counter, no_unk_namespace={
            'w_nounk',
        })
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('That', 'w_nounk')
        assert excinfo.type == RuntimeError
        assert vocab.get_token_index('This', 'w_nounk') == 1

        # Test extending a vocabulary from a counter without pad & unk tokens
        counter = {'w_nounk_nopad': Counter(['This', 'is', 'a'])}
        vocab.extend_from_counter(counter,
                                  no_unk_namespace={'w_nounk_nopad'},
                                  no_pad_namespace={'w_nounk_nopad'})
        with pytest.raises(RuntimeError) as excinfo:
            vocab.get_token_index('That', 'w_nounk_nopad')
        assert excinfo.type == RuntimeError
        assert vocab.get_token_index('This', 'w_nounk_nopad') == 0
Example #9
def main():
    # Configuration file processing
    argparser = argparse.ArgumentParser()
    argparser.add_argument('--config_file', default='../configs/debug.cfg')
    argparser.add_argument('--continue_training', action='store_true',
                           help='Load model and continue training')
    argparser.add_argument('--name', default='experiment',
                           help='The name of the experiment.')
    argparser.add_argument('--model', default='s2s',
                           help='s2s: seq2seq-head-selection-model; '
                           's2tBFS: seq2tree-BFS-decoder-model; '
                           's2tDFS: seq2tree-DFS-decoder-model')
    argparser.add_argument('--gpu', default='0', help='GPU ID (-1 for CPU)')
    args, extra_args = argparser.parse_known_args()
    cfg = IniConfigurator(args.config_file, extra_args)

    # Logger setting
    logger = dual_channel_logger(
        __name__,
        file_path=cfg.LOG_FILE,
        file_model='w',
        formatter='%(asctime)s - %(levelname)s - %(message)s',
        time_formatter='%m-%d %H:%M')
    from eval.script_evaluator import ScriptEvaluator

    # DyNet setting
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    import dynet_config
    dynet_config.set(mem=cfg.DYNET_MEM, random_seed=cfg.DYNET_SEED)
    dynet_config.set_gpu()
    import dynet as dy

    # Build the dataset of the training process
    # Build data reader
    data_reader = PTBReader(
        field_list=['word', 'tag', 'head', 'rel'],
        root='0\t**root**\t_\t**rcpos**\t**rpos**\t_\t0\t**rrel**\t_\t_',
        spacer=r'[\t]',)
    # Build vocabulary with pretrained glove
    vocabulary = Vocabulary()
    g_word, _ = glove_reader(cfg.GLOVE)
    pretrained_vocabs = {'glove': g_word}
    vocabulary.extend_from_pretrained_vocab(pretrained_vocabs)
    # Setup datasets
    datasets_settings = {'train': DatasetSetting(cfg.TRAIN, True),
                         'dev': DatasetSetting(cfg.DEV, False),
                         'test': DatasetSetting(cfg.TEST, False), }
    datasets = PTBDataset(vocabulary, datasets_settings, data_reader)
    counters = {'word': Counter(), 'tag': Counter(), 'rel': Counter()}
    datasets.build_dataset(counters, no_pad_namespace={'rel'},
                           no_unk_namespace={'rel'})

    logger.info("Experiment name: %s" % args.name)
    SHA = os.popen('git log -1 | head -n 1 | cut -c 8-13').readline().rstrip()
    logger.info('Git SHA: %s' % SHA)

    # Build Test model
    test_pc = dy.ParameterCollection()
    token_repre, encoder, decoder = dy.load(cfg.BEST_FILE, test_pc)

    # PTB Evaluator
    my_eval = ScriptEvaluator(['Valid', 'Test'], datasets.vocabulary)
    my_eval.clear('Test')

    def cmp(ins):
        return len(ins['word'])  # key: sort instances by sentence length when batching
    test_batch = datasets.get_batches('test', cfg.TEST_BATCH_SIZE, False, cmp,
                                      False)
    for indexes, masks, truth in test_batch:
        dy.renew_cg()
        vectors = token_repre(indexes, False)
        # pass the sequence mask, as in the evaluation loops of Example #5
        vectors = encoder(vectors, None, cfg.RNN_DROP, cfg.RNN_DROP,
                          np.array(masks['1D']).T, False)
        pred = decoder(vectors, masks, None, False, True)
        my_eval.add_truth('Test', truth)
        my_eval.add_pred('Test', pred)
    my_eval.evaluation('Test', cfg.PRED_TEST, cfg.TEST)