Example #1
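# Imports this snippet appears to rely on (a sketch; LoadEmbedding, BatchGetter,
# ConllBatchGetter, MergeBatchGetter, EmbeddingLayer, AttentionFlowLayer,
# ModelingOutLayer, Vocab, ConllBoundaryPerformance, evaluate_one, and config
# are project-local names assumed to come from the surrounding repository):
import codecs

import torch
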
def evaluate_all(my_arg, pr=True):
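    """Evaluate the saved model for configuration `my_arg` on the dev data.

    Loads the checkpoint from 'model<my_arg>/', decodes every dev batch with
    evaluate_one, and returns the (f, p, r) triple from the evaluator. With
    pr=True, per-sentence results are also reported.
    """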
    emb = LoadEmbedding('res/emb.txt')
    print 'finished loading embedding'
    # batch_getter = BatchGetter('data/dev', 'GPE_NAM', 1, False)
    batch_getter_lst = []
    # my_arg selects the dev data: 0 and 2 load the CoNLL-2003 PER/LOC/MISC/ORG
    # splits from bio_eng.testa; 1 loads the *_NAM mention types from data/dev.
    if my_arg == 0:
        pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                               'PER', 1, False)
        batch_getter_lst.append(pernam_batch_getter)

        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'LOC', 1, False)
        batch_getter_lst.append(loc_batch_getter)

        misc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                             'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)

        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'ORG', 1, False)
        batch_getter_lst.append(org_batch_getter)

    if my_arg == 1:
        pernam_batch_getter = BatchGetter('data/dev', 'PER_NAM', 1, False)
        batch_getter_lst.append(pernam_batch_getter)

        fac_batch_getter = BatchGetter('data/dev', 'FAC_NAM', 1, False)
        batch_getter_lst.append(fac_batch_getter)

        loc_batch_getter = BatchGetter('data/dev', 'LOC_NAM', 1, False)
        batch_getter_lst.append(loc_batch_getter)

        gpe_batch_getter = BatchGetter('data/dev', 'GPE_NAM', 1, False)
        batch_getter_lst.append(gpe_batch_getter)

        org_batch_getter = BatchGetter('data/dev', 'ORG_NAM', 1, False)
        batch_getter_lst.append(org_batch_getter)
    if my_arg == 2:
        pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                               'PER', 1, False)
        batch_getter_lst.append(pernam_batch_getter)

        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'LOC', 1, False)
        batch_getter_lst.append(loc_batch_getter)

        misc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                             'MISC', 1, False)
        batch_getter_lst.append(misc_batch_getter)

        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.testa',
                                            'ORG', 1, False)
        batch_getter_lst.append(org_batch_getter)

    batch_getter = MergeBatchGetter(batch_getter_lst, 1, False)
    print 'finished loading dev data'
    # Assemble the model: embedding -> attention flow -> modeling/output layers
    # (the second EmbeddingLayer argument presumably disables dropout for evaluation).
    embedding_layer = EmbeddingLayer(emb, 0)
    d = embedding_layer.get_out_dim()
    att_layer = AttentionFlowLayer(2 * d)
    # if my_arg == 2:
    model_out_layer = ModelingOutLayer(8 * d, d, 2, 3, 0)
    # else:
    #     model_out_layer = ModelingOutLayer(8*d, d, 2, 2, 0)
    model_dir = 'model' + str(my_arg)

    # Restore the weights checkpointed by the training loop (Example #2).
    embedding_layer.load_state_dict(
        torch.load(model_dir + '/embedding_layer.pkl'))
    att_layer.load_state_dict(torch.load(model_dir + '/att_layer.pkl'))
    model_out_layer.load_state_dict(
        torch.load(model_dir + '/model_out_layer.pkl'))

    # models = [embedding_layer, att_layer, model_out_layer]
    # opts = [emb_opt, att_opt, model_out_opt]
    ner_tag = Vocab('res/ner_xx',
                    unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    # if my_arg == 2:
    evaluator = ConllBoundaryPerformance(ner_tag)
    # else:
    #     evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()

    if config['USE_CUDA']:
        att_layer.cuda(config['cuda_num'])
        embedding_layer.cuda(config['cuda_num'])
        model_out_layer.cuda(config['cuda_num'])

    # Optimizers are created only because evaluate_one presumably expects them
    # in its signature; no parameter updates should happen during evaluation.
    emb_opt = torch.optim.Adam(embedding_layer.parameters())
    att_opt = torch.optim.Adam(att_layer.parameters())
    model_out_opt = torch.optim.Adam(model_out_layer.parameters())
    out_file = codecs.open('data/eva_result' + str(my_arg),
                           mode='wb',
                           encoding='utf-8')

    ex_iterations = 0
    for iteration, this_batch in enumerate(batch_getter):
        target, rec = evaluate_one(ex_iterations + iteration, embedding_layer,
                                   att_layer, model_out_layer, emb_opt,
                                   att_opt, model_out_opt, this_batch)
        evaluator.evaluate(iteration,
                           target.numpy().tolist(),
                           rec.numpy().tolist(),
                           out_file,
                           pr=pr)
        if iteration % 100 == 0:
            print '{} sentences processed'.format(iteration)
            evaluator.get_performance()
    return evaluator.get_performance()
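
A minimal way to run this evaluation on its own (a hypothetical sketch: it assumes the project-local modules above are importable and that a trained checkpoint already exists under model0/):

if __name__ == '__main__':
    # 0 selects the CoNLL-2003 dev configuration; evaluate_all returns (f, p, r).
    f, p, r = evaluate_all(0, pr=True)
    print 'f: {} p: {} r: {}'.format(f, p, r)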
Example #2
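# Imports this snippet appears to rely on (a sketch; Logger, config, and the
# data/model helpers are project-local names shared with Example #1):
import os
import time

import torch
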
def main(my_arg):
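    """Train the model for configuration `my_arg`.

    Runs up to config['max_epoch'] epochs, checkpoints periodically, evaluates
    on dev after every epoch, keeps the best-F1 checkpoint as 'early_*.pkl',
    and stops early after config['early_stop'] epochs without improvement.
    """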
    logger = Logger('./logs' + str(my_arg))
    emb = LoadEmbedding('res/embedding.txt')
    print 'finished loading embedding'
    # batch_getter = BatchGetter('data/train', 'GPE_NAM', config['batch_size'])
    batch_getter_lst = []
    # my_arg selects the training data: 0 and 2 load the CoNLL-2003
    # PER/LOC/MISC/ORG splits from bio_eng.train; 1 loads the *_NAM mention
    # types from data/train.
    if my_arg == 0:
        pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                               'PER', 1, True)
        batch_getter_lst.append(pernam_batch_getter)

        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                            'LOC', 1, True)
        batch_getter_lst.append(loc_batch_getter)

        misc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                             'MISC', 1, True)
        batch_getter_lst.append(misc_batch_getter)

        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                            'ORG', 1, True)
        batch_getter_lst.append(org_batch_getter)

    if my_arg == 1:
        pernam_batch_getter = BatchGetter('data/train', 'PER_NAM', 1)
        batch_getter_lst.append(pernam_batch_getter)

        fac_batch_getter = BatchGetter('data/train', 'FAC_NAM', 1)
        batch_getter_lst.append(fac_batch_getter)

        loc_batch_getter = BatchGetter('data/train', 'LOC_NAM', 1)
        batch_getter_lst.append(loc_batch_getter)

        gpe_batch_getter = BatchGetter('data/train', 'GPE_NAM', 1)
        batch_getter_lst.append(gpe_batch_getter)

        org_batch_getter = BatchGetter('data/train', 'ORG_NAM', 1)
        batch_getter_lst.append(org_batch_getter)
    if my_arg == 2:
        pernam_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                               'PER', 1, True)
        batch_getter_lst.append(pernam_batch_getter)

        loc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                            'LOC', 1, True)
        batch_getter_lst.append(loc_batch_getter)

        misc_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                             'MISC', 1, True)
        batch_getter_lst.append(misc_batch_getter)

        org_batch_getter = ConllBatchGetter('data/conll2003/bio_eng.train',
                                            'ORG', 1, True)
        batch_getter_lst.append(org_batch_getter)

    batch_getter = MergeBatchGetter(batch_getter_lst, config['batch_size'],
                                    True)
    print 'finished loading train data'
    embedding_layer = EmbeddingLayer(emb)
    d = embedding_layer.get_out_dim()
    att_layer = AttentionFlowLayer(2 * d)
    # if my_arg == 2:
    model_out_layer = ModelingOutLayer(8 * d, d, 2, 3)
    # else:
    #     model_out_layer = ModelingOutLayer(8*d, d, 2, 2)
    # models = [embedding_layer, att_layer, model_out_layer]
    # opts = [emb_opt, att_opt, model_out_opt]

    if config['USE_CUDA']:
        att_layer.cuda(config['cuda_num'])
        embedding_layer.cuda(config['cuda_num'])
        model_out_layer.cuda(config['cuda_num'])

    emb_opt = torch.optim.Adadelta(embedding_layer.parameters())
    att_opt = torch.optim.Adadelta(att_layer.parameters())
    model_out_opt = torch.optim.Adadelta(model_out_layer.parameters())

    log_file = open('log_file' + str(my_arg), 'w')
    f_max = 0
    low_epoch = 0
    ex_iterations = 0
    model_dir = 'model' + str(my_arg)
    # Train for up to config['max_epoch'] epochs, evaluating on dev after each.
    for epoch in range(config['max_epoch']):
        for iteration, this_batch in enumerate(batch_getter):
            time0 = time.time()
            print 'epoch: {}, iteration: {}'.format(epoch, ex_iterations + iteration)
            train_iteration(logger, ex_iterations + iteration, embedding_layer,
                            att_layer, model_out_layer, emb_opt, att_opt,
                            model_out_opt, this_batch)
            time1 = time.time()
            print 'this iteration took {:.3f}s\n'.format(time1 - time0)
            # Checkpoint all three modules every config['save_freq'] iterations.
            if (ex_iterations + iteration) % config['save_freq'] == 0:
                torch.save(embedding_layer.state_dict(),
                           model_dir + '/embedding_layer.pkl')
                torch.save(att_layer.state_dict(),
                           model_dir + '/att_layer.pkl')
                torch.save(model_out_layer.state_dict(),
                           model_dir + '/model_out_layer.pkl')

        # Carry the global iteration count across epochs, then evaluate on dev.
        ex_iterations += iteration + 1
        batch_getter.reset()
        f, p, r = evaluate_all(my_arg, False)
        log_file.write('epoch: {} f: {} p: {} r: {}\n'.format(epoch, f, p, r))
        log_file.flush()
        # Early stopping: track the best F1 and keep a copy of that checkpoint.
        if f >= f_max:
            f_max = f
            low_epoch = 0
            # torch.save(embedding_layer.state_dict(), model_dir+'/early_embedding_layer.pkl')
            # torch.save(att_layer.state_dict(), model_dir+'/early_att_layer.pkl')
            # torch.save(model_out_layer.state_dict(), model_dir+'/early_model_out_layer.pkl')
            # Snapshot the current checkpoint as the best-so-far 'early_*' copies
            # (Unix cp; shutil.copy would be the portable alternative).
            os.system('cp {}/embedding_layer.pkl {}/early_embedding_layer.pkl'.
                      format(model_dir, model_dir))
            os.system('cp {}/att_layer.pkl {}/early_att_layer.pkl'.format(
                model_dir, model_dir))
            os.system('cp {}/model_out_layer.pkl {}/early_model_out_layer.pkl'.
                      format(model_dir, model_dir))

        else:
            low_epoch += 1
            log_file.write('no improvement for {} epochs\n'.format(low_epoch))
            log_file.flush()
        if low_epoch >= config['early_stop']:
            break
    log_file.close()
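
A hypothetical entry point for this training script, assuming my_arg is given on the command line (0 and 2 select CoNLL-2003, 1 selects the *_NAM types from data/train):

if __name__ == '__main__':
    import sys
    # Training, per-epoch evaluation, and early stopping all happen inside main().
    main(int(sys.argv[1]))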