def _clip_after_eos(tokens):
    """Return a copy of ``tokens`` cut just after the first ``'</s>'``.

    The ``'</s>'`` token itself is kept. When ``tokens`` contains no
    ``'</s>'`` the whole sequence is returned.
    """
    if '</s>' in tokens:
        return tokens[:tokens.index('</s>') + 1]
    return tokens[:]


def evaluate_autoencode_model(
    model, src, src_test, config, src_valid=None,
    verbose=True, metric='bleu'
):
    """Greedily decode ``src_test`` with ``model`` and score the output.

    The test data is processed in minibatches. For every minibatch the
    decoder input starts as a single ``'<s>'`` id per sentence and is grown
    one token at a time by appending the arg-max prediction for the last
    position. Predictions and references are mapped back to words, cut
    after their first ``'</s>'`` and handed to ``get_bleu``.

    Args:
        model: Callable as ``model(input_lines_src, input_lines_trg)``; must
            also expose ``decode(logits)``.
        src: Dict providing the ``'word2id'`` and ``'id2word'`` mappings;
            ``'word2id'`` must contain ``'<s>'``.
        src_test: Dict whose ``'data'`` entry holds the sentences to decode.
        config: Dict providing ``config['data']['batch_size']`` and
            ``config['data']['max_src_length']``.
        src_valid: Unused; kept for interface compatibility.
        verbose: When true, print every prediction/reference pair.
        metric: Unused; the score always comes from ``get_bleu``.

    Returns:
        Whatever ``get_bleu(preds, ground_truths)`` returns.
    """
    batch_size = config['data']['batch_size']
    max_length = config['data']['max_src_length']
    num_sentences = len(src_test['data'])
    start_id = src['word2id']['<s>']
    id2word = src['id2word']

    preds = []
    ground_truths = []

    for offset in xrange(0, num_sentences, batch_size):
        # Single-argument print(...) behaves identically as a Python 2
        # print statement, which is what the rest of this file targets.
        print('Decoding batch : %d out of %d ' % (offset, num_sentences))

        # Only the source batch (first returned element) is needed here.
        # Indexing instead of fixed-arity unpacking keeps this working
        # regardless of how many auxiliary values (lengths, masks, ...)
        # the helper returns; the training loop in this file unpacks four
        # values from the same call.
        minibatch = get_autoencode_minibatch(
            src_test['data'], src['word2id'], offset, batch_size,
            max_length, add_start=True, add_end=True
        )
        input_lines_src = minibatch[0]

        # Seed the decoder with one '<s>' id per sentence in the batch.
        input_lines_trg = Variable(torch.LongTensor(
            [[start_id] for _ in xrange(input_lines_src.size(0))]
        ))

        # Greedy decoding: always run max_length steps; anything produced
        # after the first '</s>' is discarded further below.
        for _ in xrange(max_length):
            decoder_logit = model(input_lines_src, input_lines_trg)
            word_probs = model.decode(decoder_logit)
            decoder_argmax = word_probs.data.cpu().numpy().argmax(axis=-1)
            # Keep only the prediction for the most recent position.
            next_preds = Variable(torch.from_numpy(decoder_argmax[:, -1]))
            input_lines_trg = torch.cat(
                (input_lines_trg, next_preds.unsqueeze(1)), 1
            )

        predicted_words = [
            [id2word[token_id] for token_id in row]
            for row in input_lines_trg.data.cpu().numpy()
        ]
        # Autoencoding: the reference is the source batch itself.
        reference_words = [
            [id2word[token_id] for token_id in row]
            for row in input_lines_src.data.cpu().numpy()
        ]

        for sentence_pred, sentence_real in zip(
            predicted_words, reference_words
        ):
            clipped_pred = _clip_after_eos(sentence_pred)
            clipped_real = _clip_after_eos(sentence_real)
            preds.append(clipped_pred)
            ground_truths.append(clipped_real)

            if verbose:
                print(' '.join(clipped_pred))
                print(' '.join(clipped_real))
                print('--------------------------------------')

    return get_bleu(preds, ground_truths)
src, src_test, config, verbose=False, metric='bleu', ) logging.info('Epoch : %d : BLEU : %.5f ' % (0, bleu)) for i in xrange(1000): losses = [] for j in xrange(0, len(src['data']), batch_size): input_lines_src, output_lines_src, lens_src, mask_src = get_autoencode_minibatch( src['data'], src['word2id'], j, batch_size, max_length, add_start=True, add_end=True) decoder_logit = model(input_lines_src) optimizer.zero_grad() loss = loss_criterion( decoder_logit.contiguous().view(-1, src_vocab_size), output_lines_src.view(-1)) losses.append(loss.data[0]) loss.backward() optimizer.step() if j % config['management']['monitor_loss'] == 0: