Example #1
    def cmn_crf_eval_free(config, log_dir, pr=True):
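        """Evaluate the Chinese (cmn) BIO tagger (CNN encoder -> bidirectional
        RNN -> CRF) on config['dev_data'], restoring the 'early_*' checkpoints
        from config['model_dir']; returns the final boundary performance."""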
        word_emb = LoadEmbedding(config['eval_word_emb'])
        char_emb = LoadEmbedding(config['eval_char_emb'])
        print('finish loading embedding')
        encoder = CMNBioCNNEncoder(config, word_emb, char_emb, config['dropout'])
        bidencoder = BidRnnBioDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                      config['decoder_output_size'], config['dropout'], config['decoder_layers'])

        decoder = CRF(config, config['BioOutTags'], config['hidden_size'] * 2)
        en_dict = torch.load(os.path.join(config['model_dir'], 'early_encoder_params.pkl'))
        bid_dict = torch.load(os.path.join(config['model_dir'], 'early_bidencoder_params.pkl'))
        de_dict = torch.load(os.path.join(config['model_dir'], 'early_decoder_params.pkl'))
        # en_dict = torch.load('spa_bio_model/encoder_params.pkl')
        # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
        # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
        # print en_dict.keys()
        encoder.load_state_dict(en_dict)
        bidencoder.load_state_dict(bid_dict)
        decoder.load_state_dict(de_dict)
        # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
        # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
        # decoder_optimizer.zero_grad()
        # encoder_optimizer.zero_grad()
        # batch_getter = CMNBioBatchGetter(config, config['dev_data'], 1, shuffle=False)
        batch_size = 100
        batch_getter = CMNBioBatchGetter(config, config['dev_data'], batch_size, shuffle=False)

        if config['USE_CUDA']:
            encoder.cuda(config['cuda_num'])
            bidencoder.cuda(config['cuda_num'])
            decoder.cuda(config['cuda_num'])

        ner_tag = config['BioOutTags']
        evaluator = BoundaryPerformance(ner_tag)
        evaluator.reset()

        out_file = codecs.open(os.path.join(log_dir, 'bio_eva_result'), mode='wb', encoding='utf-8')

        for i, this_batch in enumerate(batch_getter):
            top_path = crf_eval_one_sen(config, encoder, bidencoder, decoder, this_batch)
            # top_path = top_path[1:]
            for batch_no, path in enumerate(top_path):
                evaluator.evaluate(i, remove_end_tag(this_batch[1].numpy()[batch_no, :].tolist()), path, out_file, pr)
            # print [ner_tag.getWord(tag) for tag in top_path]
            # evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file, pr)
            if (i + 1) * batch_size % 100 == 0:
            print('{} sentences processed'.format((i + 1) * batch_size))
                evaluator.get_performance()

        return evaluator.get_performance()
Example #2
    def build_model(self, config):
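        """Build the evaluation encoder/decoder pair (dropout disabled),
        choosing the word+char variant when config['lang'] == 'cmn', and
        restore the 'early_*' checkpoints; map_location keeps the loaded
        tensors on CPU so the load also works on machines without CUDA."""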
        if config['lang'] == 'cmn':
            word_emb = LoadEmbedding(config['eval_word_emb'])
            char_emb = LoadEmbedding(config['eval_char_emb'])
            print('finish loading embedding')
            encoder = CMNBioCNNEncoder(config, word_emb, char_emb, dropout_p=0)
            decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                    config['decoder_output_size'], config['output_dim'], 0, config['decoder_layers'])
        else:
            emb = LoadEmbedding(config['eval_emb'])
            # emb = config['loaded_emb']
            print('finish loading embedding')
            encoder = BioCNNEncoder(config, emb, dropout_p=0)
            decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                    config['decoder_output_size'], config['output_dim'], 0, config['decoder_layers'])

        # encoder = BioCNNEncoder(config, emb, dropout_p=0)
        # decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
        #                         config['decoder_output_size'], config['output_dim'], 0, config['decoder_layers'])
        en_dict = torch.load(os.path.join(config['model_dir'], 'early_encoder_params.pkl'), map_location=lambda storage, loc: storage)
        de_dict = torch.load(os.path.join(config['model_dir'], 'early_decoder_params.pkl'), map_location=lambda storage, loc: storage)
        # en_dict = torch.load('bio_model_eng/early_encoder_params.pkl')
        # de_dict = torch.load('bio_model_eng/early_decoder_params.pkl')
        # en_dict = torch.load('spa_bio_model/encoder_params.pkl')
        # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
        # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
        # print en_dict.keys()
        encoder.load_state_dict(en_dict)
        decoder.load_state_dict(de_dict)
        # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
        # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
        # decoder_optimizer.zero_grad()
        # encoder_optimizer.zero_grad()
        # batch_getter = BioBatchGetter(config, config['dev_data'], 1, shuffle=False)
        if config['USE_CUDA']:
            encoder.cuda(config['cuda_num'])
            decoder.cuda(config['cuda_num'])
        self.encoder = encoder
        self.decoder = decoder
Example #3
def evaluate_all(my_arg, pr=True):
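    """Decode data/eng_dev.txt one sentence at a time with the attention
    decoder and report boundary NER performance every 100 sentences."""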
    emb = LoadEmbedding('res/emb.txt')
    print('finish loading embedding')
    encoder = CNNEncoder(emb, dropout_p=0)
    decoder = BahdanauAttnDecoderRNN(config,
                                     config['encoder_outputs_size'],
                                     config['hidden_size'],
                                     config['decoder_output_size'],
                                     config['decoder_layers'],
                                     dropout_p=0)
    en_dict = torch.load('model/encoder_params.pkl')
    de_dict = torch.load('model/decoder_params.pkl')
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    # print en_dict.keys()
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # decoder_optimizer.zero_grad()
    # encoder_optimizer.zero_grad()
    batch_getter = BatchGetter('data/eng_dev.txt', 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])

    ner_tag = Vocab('res/ner_xx',
                    unk_id=config['UNK_token'],
                    pad_id=config['PAD_token'])
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()

    out_file = codecs.open('data/eva_result' + str(my_arg),
                           mode='wb',
                           encoding='utf-8')

    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(encoder, decoder, this_batch)
        top_path = top_path[1:]
        # print [ner_tag.getWord(tag) for tag in top_path]
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path,
                           out_file, pr)
        if i % 100 == 0:
            print('{} sentences processed'.format(i))
            evaluator.get_performance()

    return evaluator.get_performance()
Example #4
def evaluate_all(config, my_arg, log_dir, pr=True):
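    """Evaluate the BIO encoder/decoder on config['dev_data'] with batch
    size 1, writing per-sentence results to log_dir/bio_eva_result."""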
    emb = LoadEmbedding(config['eval_emb'])
    print('finish loading embedding')
    encoder = BioCNNEncoder(config, emb, dropout_p=0)
    decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                            config['decoder_output_size'], config['output_dim'], 0, config['decoder_layers'])
    en_dict = torch.load(os.path.join(config['model_dir'], 'encoder_params.pkl'))
    de_dict = torch.load(os.path.join(config['model_dir'], 'decoder_params.pkl'))
    # en_dict = torch.load('spa_bio_model/encoder_params.pkl')
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    # print en_dict.keys()
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # decoder_optimizer.zero_grad()
    # encoder_optimizer.zero_grad()
    batch_getter = BioBatchGetter(config, config['dev_data'], 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])

    ner_tag = config['BioOutTags']
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()

    out_file = codecs.open(os.path.join(log_dir, 'bio_eva_result'), mode='wb', encoding='utf-8')

    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(config, encoder, decoder, this_batch)
        top_path = top_path[1:]
        # print [ner_tag.getWord(tag) for tag in top_path]
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file, pr)
        if i % 100 == 0:
            print('{} sentences processed'.format(i))
            evaluator.get_performance()

    return evaluator.get_performance()
Example #5
    print(type(my_arg), my_arg)


    config = get_conf('cmn')  # get_conf(args.lang)
    config['model_dir'] = 'bid_' + config['model_dir']
    config['decoder_layers'] = 1
    config['batch_size'] = 64
    # config['dropout'] = 0.25
    # config['encoder_filter_num'] = 400
    # config['hidden_size'] = 400
    # config['encoder_outputs_size'] = config['hidden_size']
    # config['USE_CUDA'] = False

    # my_arg = 0

    char_emb = LoadEmbedding(config['char_embedding'])
    word_emb = LoadEmbedding(config['embedding_file'])

    # word_emb = LoadEmbedding(config['eval_word_emb'])
    # char_emb = LoadEmbedding(config['eval_char_emb'])

    print('finish loading embedding')
    encoder = CMNBioCNNEncoder(config, word_emb, char_emb, config['dropout'])
    bidencoder = BidRnnBioDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                  config['decoder_output_size'], config['dropout'], config['decoder_layers'])

    decoder = BioRnnDecoder(config, config['hidden_size']*2, config['hidden_size'],
                            config['decoder_output_size'], config['output_dim'], config['dropout'],
                            1)
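    # the decoder's input size is hidden_size * 2, matching the concatenated
    # forward/backward outputs of the bidirectional encoder above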
    # en_dict = torch.load('model/encoder_params.pkl')
    # de_dict = torch.load('model/decoder_params.pkl')
Example #6
        #     torch.save(encoder.state_dict(), 'model/encoder_params.pkl')
    return ex_iterations + iteration


def train(encoder, decoder, encoder_optimizer, decoder_optimizer):
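    """Run up to 1000 epochs, carrying the global iteration counter forward
    from a previous run (hence the hard-coded 6309)."""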
    batch_getter = BatchGetter('data/train')
    ex_iterations = 6309
    for i in range(1000):
        result = train_epoch(i, ex_iterations, batch_getter, encoder, decoder,
                             encoder_optimizer, decoder_optimizer)
        ex_iterations = result + 1


if __name__ == '__main__':
    print('ddd')
    emb = LoadEmbedding('res/emb.txt')
    print('finish loading embedding')
    encoder = MultiCNNEncoder(emb)
    decoder = MultiDecoder(config, config['encoder_outputs_size'],
                           config['hidden_size'],
                           config['decoder_output_size'],
                           config['decoder_layers'])
    en_dict = torch.load('model/encoder_params.pkl')
    de_dict = torch.load('model/decoder_params.pkl')
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    encoder = nn.DataParallel(encoder, device_ids=config['multi_cuda'])
    decoder = nn.DataParallel(decoder, device_ids=config['multi_cuda'])
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
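
A note on the dict comprehension commented out in several of these examples, `{k.partition('module.')[2]: en_dict[k] for k in en_dict}`: checkpoints saved from a model wrapped in nn.DataParallel, as above, prefix every parameter name with 'module.', and that prefix must be stripped before the state dict fits a bare model again. Below is a minimal self-contained sketch; strip_module_prefix is a hypothetical helper, not part of this codebase, and unlike the one-liner it also keeps unprefixed keys intact.

    import torch.nn as nn

    def strip_module_prefix(state_dict):
        # str.partition returns ('', 'module.', rest) when the key starts with
        # the prefix and (key, '', '') otherwise, so fall back to the original
        # key rather than mapping unprefixed keys to ''.
        return {k.partition('module.')[2] or k: v for k, v in state_dict.items()}

    model = nn.Linear(4, 2)
    wrapped = nn.DataParallel(model)
    saved = wrapped.state_dict()                       # keys look like 'module.weight'
    model.load_state_dict(strip_module_prefix(saved))  # fits the bare model again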
Example #7
    print(lang, my_arg, log_dir, train_data, dev_data, model_dir, cuda_num,
          type(cuda_num))

    config = get_conf(lang)  # get_conf(args.lang)
    # config['train_data'] = 'data/bio_cmn_all.txt'
    # config['USE_CUDA'] = False
    # config['batch_size'] = 8

    # my_arg = 0
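    # override the defaults from get_conf with the caller-supplied paths and device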
    config['train_data'] = train_data
    config['dev_data'] = dev_data
    config['model_dir'] = model_dir
    config['cuda_num'] = cuda_num
    config['epoch'] = epoch

    emb = LoadEmbedding(config['embedding_file'])
    print('finish loading embedding')
    encoder = BioCNNEncoder(config, emb, config['dropout'])
    decoder = BioRnnDecoder(config, config['encoder_filter_num'],
                            config['hidden_size'],
                            config['decoder_output_size'],
                            config['output_dim'], config['dropout'],
                            config['decoder_layers'])
    # en_dict = torch.load('model/encoder_params.pkl')
    # de_dict = torch.load('model/decoder_params.pkl')
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    # encoder.load_state_dict(en_dict)
    # decoder.load_state_dict(de_dict)
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
Example #8
class MultiBioCNNEncoder(BioCNNEncoder):
    def __init__(self, loaded_embedding, dropout_p):
        super(MultiBioCNNEncoder, self).__init__(loaded_embedding, dropout_p)

    # input: (batch, seq_length) h_0: (batch, num_layers * num_directions, hidden_size)
    def forward(self, step, input, h_0, seq_length):
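        """Adapt BioCNNEncoder.forward to the batch-first convention that
        nn.DataParallel needs for scattering along dim 0: h_0 is moved back to
        (num_layers * num_directions, batch, hidden_size), step and seq_length
        are unwrapped from tensors, and the seq-first result is returned
        batch-first."""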
        step = step.data[0, 0]
        h_0 = h_0.transpose(0, 1).contiguous()
        seq_length = seq_length.cpu().data.numpy().reshape(-1).tolist()
        # print 'step', step, 'input', input.size(), 'h0', h_0.size(), 'seq', len(seq_length)
        result = super(MultiBioCNNEncoder, self).forward(
            step, input, h_0,
            seq_length)  # (seq_len, batch, hidden_size * num_directions)
        return result.transpose(0, 1)


if __name__ == '__main__':
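    # quick smoke test: embed one training batch and push it through the encoder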
    emb = LoadEmbedding('res/emb.txt')
    print(emb.get_embedding_tensor()[0:5, :])
    encoder = BioCNNEncoder(emb)

    batch_getter = BatchGetter('data/train')
    data = Variable(next(batch_getter)[0])
    # if config['USE_CUDA']:
    #     encoder.cuda(config['cuda_num'])
    #     data = data.cuda(config['cuda_num'])
    print(data)
    print(encoder(data))