def cmn_crf_eval_free(config, log_dir, pr=True):
    word_emb = LoadEmbedding(config['eval_word_emb'])
    char_emb = LoadEmbedding(config['eval_char_emb'])
    print 'finish loading embedding'
    encoder = CMNBioCNNEncoder(config, word_emb, char_emb, config['dropout'])
    bidencoder = BidRnnBioDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                  config['decoder_output_size'], config['dropout'], config['decoder_layers'])
    decoder = CRF(config, config['BioOutTags'], config['hidden_size'] * 2)
    # restore the early-stopping checkpoints
    en_dict = torch.load(os.path.join(config['model_dir'], 'early_encoder_params.pkl'))
    bid_dict = torch.load(os.path.join(config['model_dir'], 'early_bidencoder_params.pkl'))
    de_dict = torch.load(os.path.join(config['model_dir'], 'early_decoder_params.pkl'))
    encoder.load_state_dict(en_dict)
    bidencoder.load_state_dict(bid_dict)
    decoder.load_state_dict(de_dict)
    batch_size = 100
    batch_getter = CMNBioBatchGetter(config, config['dev_data'], batch_size, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        bidencoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])
    ner_tag = config['BioOutTags']
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()
    out_file = codecs.open(os.path.join(log_dir, 'bio_eva_result'), mode='wb', encoding='utf-8')
    for i, this_batch in enumerate(batch_getter):
        top_path = crf_eval_one_sen(config, encoder, bidencoder, decoder, this_batch)
        for batch_no, path in enumerate(top_path):
            evaluator.evaluate(i, remove_end_tag(this_batch[1].numpy()[batch_no, :].tolist()),
                               path, out_file, pr)
        if (i + 1) * batch_size % 100 == 0:
            print '{} sentences processed'.format((i + 1) * batch_size)
    return evaluator.get_performance()
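
# --- Hedged usage sketch (not part of the original source) --------------------------------
# Assuming get_conf('cmn') returns the same config dict used throughout this repo and that
# the early_*_params.pkl checkpoints already exist under config['model_dir'], the CRF
# evaluator above could be driven like this. The 'bid_' prefix mirrors the setup script
# further below; 'log' is a placeholder output directory.
def _crf_eval_example():
    config = get_conf('cmn')
    config['model_dir'] = 'bid_' + config['model_dir']
    performance = cmn_crf_eval_free(config, 'log', pr=False)
    print 'dev-set performance:', performance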
def build_model(self, config):
    if config['lang'] == 'cmn':
        word_emb = LoadEmbedding(config['eval_word_emb'])
        char_emb = LoadEmbedding(config['eval_char_emb'])
        print 'finish loading embedding'
        encoder = CMNBioCNNEncoder(config, word_emb, char_emb, dropout_p=0)
        decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                config['decoder_output_size'], config['output_dim'], 0,
                                config['decoder_layers'])
    else:
        emb = LoadEmbedding(config['eval_emb'])
        print 'finish loading embedding'
        encoder = BioCNNEncoder(config, emb, dropout_p=0)
        decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                                config['decoder_output_size'], config['output_dim'], 0,
                                config['decoder_layers'])
    # load checkpoints onto CPU first, then move to GPU only if requested
    en_dict = torch.load(os.path.join(config['model_dir'], 'early_encoder_params.pkl'),
                         map_location=lambda storage, loc: storage)
    de_dict = torch.load(os.path.join(config['model_dir'], 'early_decoder_params.pkl'),
                         map_location=lambda storage, loc: storage)
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])
    self.encoder = encoder
    self.decoder = decoder
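
# --- Hedged usage sketch (not part of the original source) --------------------------------
# build_model is a method, so it assumes a host class that exposes self.encoder and
# self.decoder afterwards. The class name `Predictor` below is a placeholder assumption;
# only the call pattern is taken from this file.
def _build_model_example():
    config = get_conf('cmn')
    predictor = Predictor()          # hypothetical host class carrying build_model
    predictor.build_model(config)    # loads embeddings and restores the early_* checkpoints
    # predictor.encoder / predictor.decoder are now ready for sentence-level decoding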
def evaluate_all(my_arg, pr=True):
    emb = LoadEmbedding('res/emb.txt')
    print 'finish loading embedding'
    encoder = CNNEncoder(emb, dropout_p=0)
    decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'],
                                     config['decoder_output_size'], config['decoder_layers'], dropout_p=0)
    en_dict = torch.load('model/encoder_params.pkl')
    de_dict = torch.load('model/decoder_params.pkl')
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    batch_getter = BatchGetter('data/eng_dev.txt', 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])
    ner_tag = Vocab('res/ner_xx', unk_id=config['UNK_token'], pad_id=config['PAD_token'])
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()
    out_file = codecs.open('data/eva_result' + str(my_arg), mode='wb', encoding='utf-8')
    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(encoder, decoder, this_batch)
        top_path = top_path[1:]  # skip the first predicted tag (presumably the start symbol)
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file, pr)
        if i % 100 == 0:
            print '{} sentences processed'.format(i)
    return evaluator.get_performance()
def evaluate_all(config, my_arg, log_dir, pr=True):
    emb = LoadEmbedding(config['eval_emb'])
    print 'finish loading embedding'
    encoder = BioCNNEncoder(config, emb, dropout_p=0)
    decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                            config['decoder_output_size'], config['output_dim'], 0, config['decoder_layers'])
    en_dict = torch.load(os.path.join(config['model_dir'], 'encoder_params.pkl'))
    de_dict = torch.load(os.path.join(config['model_dir'], 'decoder_params.pkl'))
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    batch_getter = BioBatchGetter(config, config['dev_data'], 1, shuffle=False)
    if config['USE_CUDA']:
        encoder.cuda(config['cuda_num'])
        decoder.cuda(config['cuda_num'])
    ner_tag = config['BioOutTags']
    evaluator = BoundaryPerformance(ner_tag)
    evaluator.reset()
    out_file = codecs.open(os.path.join(log_dir, 'bio_eva_result'), mode='wb', encoding='utf-8')
    for i, this_batch in enumerate(batch_getter):
        top_path = eva_one_sentence(config, encoder, decoder, this_batch)
        top_path = top_path[1:]  # skip the first predicted tag (presumably the start symbol)
        evaluator.evaluate(i, this_batch[1].numpy()[0, :].tolist(), top_path, out_file, pr)
        if i % 100 == 0:
            print '{} sentences processed'.format(i)
    return evaluator.get_performance()
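
# --- Hedged usage sketch (not part of the original source) --------------------------------
# Assuming the encoder/decoder checkpoints exist under config['model_dir'] and that
# config['dev_data'] points at a BIO-tagged dev file, the evaluator above can be called
# directly. my_arg is not used in the body shown above, so 0 is a safe placeholder; the
# language key 'eng' passed to get_conf is an assumption (the repo also uses 'cmn').
def _bio_eval_example():
    config = get_conf('eng')
    performance = evaluate_all(config, 0, 'log', pr=False)
    print 'dev-set performance:', performance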
print type(my_arg), my_arg
config = get_conf('cmn')  # get_conf(args.lang)
config['model_dir'] = 'bid_' + config['model_dir']
config['decoder_layers'] = 1
config['batch_size'] = 64
# config['dropout'] = 0.25
# config['encoder_filter_num'] = 400
# config['hidden_size'] = 400
# config['encoder_outputs_size'] = config['hidden_size']
# config['USE_CUDA'] = False
# my_arg = 0
char_emb = LoadEmbedding(config['char_embedding'])
word_emb = LoadEmbedding(config['embedding_file'])
# word_emb = LoadEmbedding(config['eval_word_emb'])
# char_emb = LoadEmbedding(config['eval_char_emb'])
print 'finish loading embedding'
encoder = CMNBioCNNEncoder(config, word_emb, char_emb, config['dropout'])
bidencoder = BidRnnBioDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                              config['decoder_output_size'], config['dropout'], config['decoder_layers'])
decoder = BioRnnDecoder(config, config['hidden_size'] * 2, config['hidden_size'],
                        config['decoder_output_size'], config['output_dim'], config['dropout'], 1)
# en_dict = torch.load('model/encoder_params.pkl')
# de_dict = torch.load('model/decoder_params.pkl')
    # torch.save(encoder.state_dict(), 'model/encoder_params.pkl')
    return ex_iterations + iteration


def train(encoder, decoder, encoder_optimizer, decoder_optimizer):
    batch_getter = BatchGetter('data/train')
    ex_iterations = 6309
    for i in range(1000):
        result = train_epoch(i, ex_iterations, batch_getter, encoder, decoder,
                             encoder_optimizer, decoder_optimizer)
        ex_iterations = result + 1


if __name__ == '__main__':
    print 'ddd'
    emb = LoadEmbedding('res/emb.txt')
    print 'finish loading embedding'
    encoder = MultiCNNEncoder(emb)
    decoder = MultiDecoder(config, config['encoder_outputs_size'], config['hidden_size'],
                           config['decoder_output_size'], config['decoder_layers'])
    en_dict = torch.load('model/encoder_params.pkl')
    de_dict = torch.load('model/decoder_params.pkl')
    encoder.load_state_dict(en_dict)
    decoder.load_state_dict(de_dict)
    encoder = nn.DataParallel(encoder, device_ids=config['multi_cuda'])
    decoder = nn.DataParallel(decoder, device_ids=config['multi_cuda'])
    # if the checkpoints were saved from a DataParallel model, strip the 'module.' prefix first:
    # en_dict = {k.partition('module.')[2]: en_dict[k] for k in en_dict}
    # de_dict = {k.partition('module.')[2]: de_dict[k] for k in de_dict}
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
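
# --- Hedged wiring sketch (not part of the original source) --------------------------------
# The __main__ block above stops after building the decoder optimizer. Based only on the
# signature of train() above, the remaining wiring would plausibly look like this; it is a
# sketch, not code recovered from the original file.
def _resume_training_example(encoder, decoder):
    decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    train(encoder, decoder, encoder_optimizer, decoder_optimizer)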
print lang, my_arg, log_dir, train_data, dev_data, model_dir, cuda_num, type(cuda_num)
config = get_conf(lang)  # get_conf(args.lang)
# config['train_data'] = 'data/bio_cmn_all.txt'
# config['USE_CUDA'] = False
# config['batch_size'] = 8
# my_arg = 0
config['train_data'] = train_data
config['dev_data'] = dev_data
config['model_dir'] = model_dir
config['cuda_num'] = cuda_num
config['epoch'] = epoch
emb = LoadEmbedding(config['embedding_file'])
print 'finish loading embedding'
encoder = BioCNNEncoder(config, emb, config['dropout'])
decoder = BioRnnDecoder(config, config['encoder_filter_num'], config['hidden_size'],
                        config['decoder_output_size'], config['output_dim'], config['dropout'],
                        config['decoder_layers'])
decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
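
# --- Hedged CLI sketch (not part of the original source) -----------------------------------
# The fragment above consumes lang, my_arg, log_dir, train_data, dev_data, model_dir,
# cuda_num and epoch from its caller. One plausible way to supply them is a small argparse
# front end like the one below; the flag names and defaults are assumptions, not taken from
# the repo (only 'data/bio_cmn_all.txt' appears above as a commented-out path).
import argparse

def _parse_train_args():
    parser = argparse.ArgumentParser(description='train the BIO seq2seq tagger')
    parser.add_argument('--lang', default='cmn')
    parser.add_argument('--my_arg', type=int, default=0)
    parser.add_argument('--log_dir', default='log')
    parser.add_argument('--train_data', default='data/bio_cmn_all.txt')
    parser.add_argument('--dev_data', default='data/bio_cmn_dev.txt')
    parser.add_argument('--model_dir', default='bio_model')
    parser.add_argument('--cuda_num', type=int, default=0)
    parser.add_argument('--epoch', type=int, default=100)
    return parser.parse_args()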
class MultiBioCNNEncoder(BioCNNEncoder):
    def __init__(self, loaded_embedding, dropout_p):
        super(MultiBioCNNEncoder, self).__init__(loaded_embedding, dropout_p)

    # input: (batch, seq_length)  h_0: (batch, num_layers * num_directions, hidden_size)
    def forward(self, step, input, h_0, seq_length):
        step = step.data[0, 0]
        h_0 = h_0.transpose(0, 1).contiguous()
        seq_length = seq_length.cpu().data.numpy().reshape(-1).tolist()
        # print 'step', step, 'input', input.size(), 'h0', h_0.size(), 'seq', len(seq_length)
        result = super(MultiBioCNNEncoder, self).forward(step, input, h_0, seq_length)
        # (seq_len, batch, hidden_size * num_directions)
        return result.transpose(0, 1)


if __name__ == '__main__':
    emb = LoadEmbedding('res/emb.txt')
    print emb.get_embedding_tensor()[0:5, :]
    encoder = BioCNNEncoder(emb)
    batch_getter = BatchGetter('data/train')
    data = Variable(next(batch_getter)[0])
    # if config['USE_CUDA']:
    #     encoder.cuda(config['cuda_num'])
    #     data = data.cuda(config['cuda_num'])
    print data
    print encoder(data)
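
# --- Hedged multi-GPU sketch (not part of the original source) -----------------------------
# MultiBioCNNEncoder reshapes h_0 and seq_length so the module can sit behind
# nn.DataParallel, the same way the training script wraps MultiCNNEncoder/MultiDecoder.
# Assuming the two-argument constructor used in __init__ above and that
# config['multi_cuda'] lists the device ids, the wrapping might look like this.
def _wrap_multi_gpu_example(config):
    emb = LoadEmbedding('res/emb.txt')
    encoder = MultiBioCNNEncoder(emb, dropout_p=0.25)
    encoder = nn.DataParallel(encoder, device_ids=config['multi_cuda'])
    return encoder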