Example #1
def evaluate_all(my_arg, pr=True):
    emb = LoadEmbedding('res/emb.txt')
    print('finish loading embedding')
    encoder = CNNEncoder(emb, dropout_p=0)
    decoder = BahdanauAttnDecoderRNN(config,
                                     config['encoder_outputs_size'],
                                     config['hidden_size'],
                                     config['decoder_output_size'],
                                     config['decoder_layers'],
                                     dropout_p=0)
    en_dict = torch.load('model/encoder_params.pkl')
    de_dict = torch.load('model/decoder_params.pkl')
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    # print en_dict.keys()
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # decoder_optimizer.zero_grad()
    # encoder_optimizer.zero_grad()
    batch_getter = BatchGetter('data/eng_dev.txt', 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])

    ner_tag = Vocab('res/ner_xx',
                    unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()

    out_file = codecs.open('data/eva_result' + str(my_arg),
                           mode='wb',
                           encoding='utf-8')

    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(encoder, decoder, this_batch)
        top_path = top_path[1:]
        # print [ner_tag.getWord(tag) for tag in top_path]
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path,
                           out_file, pr)
        if i % 100 == 0:
            print('{} sentences processed'.format(i))
            evaluator.get_performance()

    return evaluator.get_performance()
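
The commented-out dict comprehensions above remap checkpoints saved from a model wrapped in torch.nn.DataParallel, which prefixes every state-dict key with 'module.'. A minimal, self-contained sketch of that remapping (the remove_module_prefix helper is illustrative, not part of the original repository):

import torch

def remove_module_prefix(state_dict):
    # nn.DataParallel saves parameters as 'module.<name>'; strip the prefix
    # so the weights load into an unwrapped model.
    return {(k[len('module.'):] if k.startswith('module.') else k): v
            for k, v in state_dict.items()}

# en_dict = torch.load('model/encoder_params.pkl', map_location='cpu')
# encoder.load_state_dict(remove_module_prefix(en_dict))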

Example #2

        patch = utils.wiki_short2full_patch()
        short2full.update(patch)
    elif fg_config['data'] == 'bbn':
        short2full = get_short2full_map(utils.get_bbn_types())
    for iteration, this_batch in enumerate(batch_getter):
        pred, label = evaluate_one(ex_iterations + iteration, word_embedding_layer, type_embedding_layer,
                                   ctx_lstm, ctx_att, warp_loss, this_batch)

        # evaluator.evaluate(label, pred, type_lst, short2full)
        evaluator.evaluate(this_batch['types_str'], pred, type_lst, short2full)
        if (iteration+1)*batch_size % 100 == 0:
            print('{} sentences processed'.format((iteration+1)*batch_size))
            evaluator.get_performance()
    return evaluator.get_performance()


if __name__ == '__main__':
    fg_config['cuda_num'] = 0
    fg_config['batch_size'] = 64
    fg_config['att'] = 'label_att'
    fg_config['zero_shot'] = True
    fg_config['no_zero'] = 'all'
    fg_config['topk'] = 3
    fg_config['data'] = 'bbn'
    fg_config['type_id'] = Vocab('res/{}/zero_type_voc.txt'.format(fg_config['data']), unk_id=fg_config['UNK_token'],
                                 pad_id=fg_config['PAD_token'])
    torch.cuda.set_device(fg_config['cuda_num'])
    evaluate_free(0)
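
This entry point pins a GPU with torch.cuda.set_device, and the snippets move modules with .cuda(config['cuda_num']). Both calls exist; as a device-agnostic alternative, a small sketch with torch.device (the cuda_num value and the commented model/batch names are placeholders, not from the repository):

import torch

cuda_num = 0  # stands in for fg_config['cuda_num']
device = torch.device('cuda:{}'.format(cuda_num)
                      if torch.cuda.is_available() else 'cpu')

# model = model.to(device)   # instead of model.cuda(cuda_num)
# batch = batch.to(device)   # input tensors follow the same pattern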


Example #3
def evaluate_all(my_arg, pr=True):
    emb = LoadEmbedding('res/emb.txt')
    print('finish loading embedding')
    # batch_getter = BatchGetter('data/dev', 'GPE_NAM', 1, False)
    batch_getter_lst = []
    if my_arg == 0:
        pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                               'PER', 1, False)
        batch_getter_lst.append(pernam_batch_getter)

        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'LOC', 1, False)
        batch_getter_lst.append(loc_batch_getter)

        misc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                             'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)

        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'ORG', 1, False)
        batch_getter_lst.append(org_batch_getter)

    if my_arg == 1:
        pernam_batch_getter = BatchGetter('data/dev', 'PER_NAM', 1, False)
        batch_getter_lst.append(pernam_batch_getter)

        fac_batch_getter = BatchGetter('data/dev', 'FAC_NAM', 1, False)
        batch_getter_lst.append(fac_batch_getter)

        loc_batch_getter = BatchGetter('data/dev', 'LOC_NAM', 1, False)
        batch_getter_lst.append(loc_batch_getter)

        gpe_batch_getter = BatchGetter('data/dev', 'GPE_NAM', 1, False)
        batch_getter_lst.append(gpe_batch_getter)

        org_batch_getter = BatchGetter('data/dev', 'ORG_NAM', 1, False)
        batch_getter_lst.append(org_batch_getter)
    if my_arg == 2:
        pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                               'PER', 1, False)
        batch_getter_lst.append(pernam_batch_getter)

        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'LOC', 1, False)
        batch_getter_lst.append(loc_batch_getter)

        misc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                             'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)

        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'ORG', 1, False)
        batch_getter_lst.append(org_batch_getter)

    batch_getter = MergeBatchGetter(batch_getter_lst, 1, False)
    print('finish loading dev data')
    embedding_layer = EmbeddingLayer(emb, 0)
    d = embedding_layer.get_out_dim()
    att_layer = AttentionFlowLayer(2 * d)
    # if my_arg == 2:
    model_out_layer = ModelingOutLayer(8 * d, d, 2, 3, 0)
    # else:
    #     model_out_layer = ModelingOutLayer(8*d, d, 2, 2, 0)
    model_dir = 'model' + str(my_arg)

    embedding_layer.load_state_dict(
        torch.load(model_dir + '/embedding_layer.pkl'))
    att_layer.load_state_dict(torch.load(model_dir + '/att_layer.pkl'))
    model_out_layer.load_state_dict(
        torch.load(model_dir + '/model_out_layer.pkl'))

    # models = [embedding_layer, att_layer, model_out_layer]
    # opts = [emb_opt, att_opt, model_out_opt]
    ner_tag = Vocab('res/ner_xx',
                    unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    # if my_arg == 2:
    evaluator = ConllBoundaryPerformance(ner_tag)
    # else:
    #     evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()

    if config['USE_CUDA']:
        att_layer.cuda(config['cuda_num'])
        embedding_layer.cuda(config['cuda_num'])
        model_out_layer.cuda(config['cuda_num'])

    emb_opt = torch.optim.Adam(embedding_layer.parameters())
    att_opt = torch.optim.Adam(att_layer.parameters())
    model_out_opt = torch.optim.Adam(model_out_layer.parameters())
    out_file = codecs.open('data/eva_result' + str(my_arg),
                           mode='wb',
                           encoding='utf-8')

    ex_iterations = 0
    for iteration, this_batch in enumerate(batch_getter):
        target, rec = evaluate_one(ex_iterations + iteration, embedding_layer,
                                   att_layer, model_out_layer, emb_opt,
                                   att_opt, model_out_opt, this_batch)
        evaluator.evaluate(iteration,
                           target.numpy().tolist(),
                           rec.numpy().tolist(),
                           out_file,
                           pr=pr)
        if iteration % 100 == 0:
            print('{} sentences processed'.format(iteration))
            evaluator.get_performance()
    return evaluator.get_performance()
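
This snippet constructs Adam optimizers inside the evaluation routine and does not switch the layers to eval mode; the longer example below calls .eval() on every module before decoding. A minimal sketch of the usual inference setup (run_inference, modules, batches and decode_fn are placeholder names, not from the repository):

import torch

def run_inference(modules, batches, decode_fn):
    for m in modules:
        m.eval()              # disable dropout, use running BatchNorm stats
    results = []
    with torch.no_grad():     # skip autograd bookkeeping during evaluation
        for batch in batches:
            results.append(decode_fn(batch))
    return results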

Example #4

    def free_evaluate_all(my_arg, pr=True):
        emb = LoadEmbedding('res/emb.txt')
        if config['label_emb'] or config['question_alone']:
            onto_emb = LoadEmbedding('res/onto_embedding.txt')
        print('finish loading embedding')
        batch_getter_lst = []
        if my_arg == 0:
            # pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa', 'PER', 1, False)
            # batch_getter_lst.append(pernam_batch_getter)

            # loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa', 'LOC', 1, False)
            # batch_getter_lst.append(loc_batch_getter)

            misc_batch_getter = ConllBatchGetter(
                'data/conll2003/bio_eng.testa', 'MISC', 1, False)
            batch_getter_lst.append(misc_batch_getter)

            # org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa', 'ORG', 1, False)
            # batch_getter_lst.append(org_batch_getter)

        if my_arg == 1:
            # pernam_batch_getter = ConllBatchGetter('data/ttt', 'PER', 1, False)
            # batch_getter_lst.append(pernam_batch_getter)
            # pernam_batch_getter = ConllBatchGetter('data/ttt', 'singer', 1, False)
            # batch_getter_lst.append(pernam_batch_getter)
            pernam_batch_getter = ConllBatchGetter(
                'data/conll2003/bio_eng.testb', 'PER', 1, False)
            batch_getter_lst.append(pernam_batch_getter)

            loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testb',
                                                'LOC', 1, False)
            batch_getter_lst.append(loc_batch_getter)
            #
            misc_batch_getter = ConllBatchGetter(
                'data/conll2003/bio_eng.testb', 'MISC', 1, False)
            batch_getter_lst.append(misc_batch_getter)

            org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testb',
                                                'ORG', 1, False)
            batch_getter_lst.append(org_batch_getter)
        if my_arg == 2:
            # pernam_batch_getter = ConllBatchGetter('data/conll2003/bioes_eng.testb', 'food', 1, False)
            # batch_getter_lst.append(pernam_batch_getter)
            pernam_batch_getter = ConllBatchGetter(
                'data/conll2003/bioes_eng.testb', 'PER', 1, False)
            batch_getter_lst.append(pernam_batch_getter)

            loc_batch_getter = ConllBatchGetter(
                'data/conll2003/bioes_eng.testb', 'LOC', 1, False)
            batch_getter_lst.append(loc_batch_getter)

            misc_batch_getter = ConllBatchGetter(
                'data/conll2003/bioes_eng.testb', 'MISC', 1, False)
            batch_getter_lst.append(misc_batch_getter)

            org_batch_getter = ConllBatchGetter(
                'data/conll2003/bioes_eng.testb', 'ORG', 1, False)
            batch_getter_lst.append(org_batch_getter)
        if my_arg == 3:
            # onto_notes = OntoNotesGetter('data/OntoNotes/test.json', '/person', 1, False)
            # batch_getter_lst.append(onto_notes)
            onto_notes_data = OntoNotesGetter('data/OntoNotes/test.json',
                                              utils.get_ontoNotes_type_lst(),
                                              1, False)
            batch_getter_lst.append(onto_notes_data)
        batch_size = 100
        batch_getter = MergeBatchGetter(batch_getter_lst,
                                        batch_size,
                                        False,
                                        data_name=config['data'])
        print('finish loading dev data')
        # if config['data'] == 'OntoNotes':
        #     emb_onto = True
        # else:
        #     emb_onto = False
        embedding_layer = EmbeddingLayer(emb, 0)
        if config['label_emb']:
            q_word_embedding = nn.Embedding(onto_emb.get_voc_size(),
                                            onto_emb.get_emb_size())
            q_word_embedding.weight.data.copy_(onto_emb.get_embedding_tensor())
            q_word_embedding.weight.requires_grad = False
        else:
            q_word_embedding = None
        d = config['hidden_size']
        if config['question_alone']:
            q_emb_layer = QLabel(onto_emb, 0)
        else:
            q_emb_layer = None
        att_layer = AttentionFlowLayer(2 * d)
        model_layer = ModelingLayer(8 * d, d, 2, 0)
        ner_hw_layer = NerHighway(2 * d, 8 * d, 1)
        ner_out_layer = NerOutLayer(10 * d, len(config['Tags']), 0)
        crf = CRF(config, config['Tags'], len(config['Tags']))
        if config['USE_CUDA']:
            att_layer.cuda(config['cuda_num'])
            embedding_layer.cuda(config['cuda_num'])
            if config['label_emb']:
                q_word_embedding.cuda(config['cuda_num'])
            model_layer.cuda(config['cuda_num'])
            ner_hw_layer.cuda(config['cuda_num'])
            ner_out_layer.cuda(config['cuda_num'])
            crf.cuda(config['cuda_num'])
            if config['question_alone']:
                q_emb_layer.cuda(config['cuda_num'])
        model_dir = 'ner_model8'

        att_layer.load_state_dict(
            torch.load(model_dir + '/early_att_layer.pkl',
                       map_location=lambda storage, loc: storage))
        model_layer.load_state_dict(
            torch.load(model_dir + '/early_model_layer.pkl',
                       map_location=lambda storage, loc: storage))
        ner_hw_layer.load_state_dict(
            torch.load(model_dir + '/early_ner_hw_layer.pkl',
                       map_location=lambda storage, loc: storage))
        ner_out_layer.load_state_dict(
            torch.load(model_dir + '/early_ner_out_layer.pkl',
                       map_location=lambda storage, loc: storage))
        crf.load_state_dict(
            torch.load(model_dir + '/early_crf.pkl',
                       map_location=lambda storage, loc: storage))
        embedding_layer.load_state_dict(
            torch.load(model_dir + '/early_embedding_layer.pkl',
                       map_location=lambda storage, loc: storage))
        if config['question_alone']:
            q_emb_layer.load_state_dict(
                torch.load(model_dir + '/q_emb_layer.pkl',
                           map_location=lambda storage, loc: storage))
        else:
            q_emb_layer = None
        if config['question_alone']:
            q_emb_layer.eval()
        embedding_layer.eval()
        att_layer.eval()
        model_layer.eval()
        ner_hw_layer.eval()
        ner_out_layer.eval()
        crf.eval()

        ner_tag = Vocab('res/ner_xx',
                        unk_id=config['UNK_token'],
                        pad_id=config['PAD_token'])
        if my_arg == 3:
            evaluator = ConllBoundaryPerformance(ner_tag, onto_notes_data)
        else:
            evaluator = ConllBoundaryPerformance(ner_tag)
        evaluator.reset()
        out_file = codecs.open('data/eva_result' + str(my_arg),
                               mode='wb',
                               encoding='utf-8')
        # writer.add_embedding(embedding_layer.word_embedding.weight.data.cpu())
        # return
        all_emb = None
        all_metadata = []
        ex_iterations = 0
        summary_emb = False
        for iteration, this_batch in enumerate(batch_getter):
            # if iteration >= 15:
            #     break

            if summary_emb:
                top_path, all_emb, all_metadata, q = evaluate_one(
                    ex_iterations + iteration, embedding_layer,
                    q_word_embedding, q_emb_layer, att_layer, model_layer,
                    ner_hw_layer, ner_out_layer, crf, this_batch, summary_emb,
                    all_emb, all_metadata)
            else:
                top_path = evaluate_one(ex_iterations + iteration,
                                        embedding_layer, q_word_embedding,
                                        q_emb_layer, att_layer, model_layer,
                                        ner_hw_layer, ner_out_layer, crf,
                                        this_batch)
            for batch_no, path in enumerate(top_path):
                evaluator.evaluate(
                    iteration * batch_size + batch_no,
                    remove_end_tag(
                        this_batch[1].numpy()[batch_no, :].tolist()), path,
                    out_file, pr)
            if (iteration + 1) * batch_size % 100 == 0:
                print('{} sentences processed'.format(
                    (iteration + 1) * batch_size))
                evaluator.get_performance()
        if summary_emb:
            writer.add_embedding(torch.cat([q, all_emb], 0),
                                 metadata=['question'] + all_metadata)
        return evaluator.get_performance()
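
The q_word_embedding block above copies pretrained vectors into an nn.Embedding and freezes them with requires_grad = False. A small self-contained sketch of the same idea (the random matrix stands in for the loaded onto_emb vectors, and the sizes are arbitrary):

import torch
import torch.nn as nn

vocab_size, emb_size = 1000, 50                   # placeholder sizes
pretrained = torch.randn(vocab_size, emb_size)    # stands in for loaded vectors

q_word_embedding = nn.Embedding(vocab_size, emb_size)
q_word_embedding.weight.data.copy_(pretrained)
q_word_embedding.weight.requires_grad = False     # keep the vectors fixed

# Equivalent one-liner in current PyTorch:
# q_word_embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
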
Example #5
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    print(en_dict.keys())
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    batch_getter = BatchGetter('data/dev', 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])

    ner_tag = Vocab('res/ner_xx',
                    unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()

    out_file = codecs.open('data/eva_result.txt', mode='wb', encoding='utf-8')

    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(encoder, decoder, this_batch)
        top_path = top_path[1:]
        # print [ner_tag.getWord(tag) for tag in top_path]
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path,
                           out_file)
        if i % 100 == 0:
            print('{} sentences processed'.format(i))
            evaluator.get_performance()
Example #6
        evaluator.reset()

        out_file = codecs.open(os.path.join(log_dir, 'bio_eva_result'), mode='wb', encoding='utf-8')

        for i, this_batch in enumerate(batch_getter):
            top_path = crf_eval_one_sen(config, encoder, bidencoder, decoder, this_batch)
            # top_path = top_path[1:]
            for batch_no, path in enumerate(top_path):
                evaluator.evaluate(i, remove_end_tag(this_batch[1].numpy()[batch_no, :].tolist()), path, out_file, pr)
            # print [ner_tag.getWord(tag) for tag in top_path]
            # evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file, pr)
            if (i + 1) * batch_size % 100 == 0:
                print('{} sentences processed'.format((i + 1) * batch_size))
                evaluator.get_performance()

        return evaluator.get_performance()
    config = get_conf('cmn')
    config['dev_data'] = 'data/bio_cmn_test.txt'
    config['decoder_output_size'] = 25
    config['model_dir'] = 'crf_' + config['model_dir']
    config['decoder_layers'] = 1
    config['BioOutTags'] = Vocab('res_cmn/crf_ner_bio.txt', unk_id=config['UNK_token'], pad_id=config['PAD_token'])

    f, p, r = cmn_crf_eval_free(config, 'test', False)
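
The callers in these examples unpack evaluator.get_performance() into an (f, p, r) triple, i.e. span-level F1, precision and recall. The repository's BoundaryPerformance/ConllBoundaryPerformance classes are not shown here, but a generic sketch of exact-match span scoring over BIO tag sequences, for illustration only (it works on string tags, whereas the examples above pass tag ids plus a Vocab, and it omits the per-sentence output written to out_file):

def bio_spans(tags):
    # Collect (start, end, type) spans from a BIO sequence such as
    # ['B-PER', 'I-PER', 'O', 'B-LOC']; end index is exclusive.
    spans, start, label = set(), None, None
    for i, tag in enumerate(list(tags) + ['O']):   # sentinel closes a trailing span
        inside = tag.startswith('I-') and tag[2:] == label
        if start is not None and not inside:
            spans.add((start, i, label))
            start, label = None, None
        if tag.startswith('B-'):
            start, label = i, tag[2:]
    return spans

def span_prf(gold_tags, pred_tags):
    # Precision/recall/F1 over exact span matches, mirroring the (f, p, r)
    # triple returned by get_performance() in these examples.
    gold, pred = bio_spans(gold_tags), bio_spans(pred_tags)
    correct = len(gold & pred)
    p = float(correct) / len(pred) if pred else 0.0
    r = float(correct) / len(gold) if gold else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return f, p, r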