def main():
    """Cross-validation training entry point.

    Splits the labelled test corpus into ``valid_num`` folds, then for each
    fold trains a model (Multi / Label / EncoderDecoder, with optional
    classifier pre-training), grid-searches decision parameters on the dev
    fold, evaluates on the held-out fold, and finally logs averaged scores
    and writes the merged per-sentence results to ``s_res.txt``.

    Side effects: creates ``./super_*`` model directories, copies the config
    file, writes per-epoch ``.npz`` checkpoints, and logs to ``log.txt``.
    """
    args = parse_args()
    config = configparser.ConfigParser()

    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path

    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])

    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name

    if model_type == 'multi':
        model_dir = './super_{}_{}{}_{}_c{}/'.format(
            model_type, vocab_name, vocab_size, data_path[0], coefficient)
    else:
        model_dir = './super_{}_{}{}_{}/'.format(
            model_type, vocab_name, vocab_size, data_path[0])
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    # Keep a copy of the config next to the checkpoints, then re-read it so
    # later parameter lookups reflect the archived copy.
    shutil.copyfile(config_file, model_dir + config_file)
    config_file = model_dir + config_file
    config.read(config_file)

    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])

    """LOGGER"""
    log_file = model_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # record CLI arguments
    logger.info('[Training start] logging to {}'.format(log_file))

    """DATASET"""
    # NOTE(review): the train/valid file paths are read from the config but
    # never used below — cross-validation folds are carved out of the
    # "single" (test) files instead. Presumably intentional; confirm.
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    correct_label, src_label, src_text, correct_index = \
        dataset.load_binary_score_file(test_src_file)
    trg_text = dataset.load(test_trg_file)

    # Shuffle once, then slice into valid_num folds of equal size.
    slice_size = len(correct_label) // valid_num
    correct_label, src_label, src_text, trg_text, correct_index = \
        gridsearch.shuffle_list(
            correct_label, src_label, src_text, trg_text, correct_index)
    correct_label = gridsearch.slice_list(correct_label, slice_size)
    src_label = gridsearch.slice_list(src_label, slice_size)
    src_text = gridsearch.slice_list(src_text, slice_size)
    trg_text = gridsearch.slice_list(trg_text, slice_size)
    correct_index = gridsearch.slice_list(correct_index, slice_size)

    evaluater = evaluate.Evaluate()
    cross_valid_result = []
    for ite in range(1, valid_num + 1):
        model_valid_dir = model_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)

        index = ite - 1
        c_label_train, c_label_dev, c_label_test = \
            gridsearch.split_train_dev_test(correct_label, index)
        label_train, label_dev, label_test = \
            gridsearch.split_train_dev_test(src_label, index)
        src_train, src_dev, src_test = \
            gridsearch.split_train_dev_test(src_text, index)
        trg_train, trg_dev, trg_test = \
            gridsearch.split_train_dev_test(trg_text, index)
        c_index_train, c_index_dev, c_index_test = \
            gridsearch.split_train_dev_test(correct_index, index)

        """VOCABULARY"""
        src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
            model_valid_dir, vocab_type, src_train, trg_train, vocab_size,
            gpu_id)
        src_vocab_size = len(src_vocab.vocab)
        trg_vocab_size = len(trg_vocab.vocab)

        src_initialW = None
        trg_initialW = None
        if pretrain_w2v:
            w2v = word2vec.Word2Vec()
            src_initialW, vector_size, src_match_word_count = \
                w2v.make_initialW(src_vocab.vocab, src_w2v_file)
            trg_initialW, vector_size, trg_match_word_count = \
                w2v.make_initialW(trg_vocab.vocab, trg_w2v_file)
            logger.info(
                'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                    src_match_word_count, src_vocab_size,
                    trg_match_word_count, trg_vocab_size))

        """ITERATOR"""
        train_iter = dataset.Iterator(
            src_train, label_train, trg_train, src_vocab, trg_vocab,
            batch_size, gpu_id, sort=True, shuffle=True)
        dev_iter = dataset.Iterator(
            src_dev, label_dev, trg_dev, src_vocab, trg_vocab,
            batch_size, gpu_id, sort=False, shuffle=False)
        test_iter = dataset.Iterator(
            src_test, label_test, trg_test, src_vocab, trg_vocab,
            batch_size, gpu_id, sort=False, shuffle=False)
        logger.info(
            'V{} ## train:{}, dev:{}, test:{}, src_vocab:{}, trg_vocab:{}'.
            format(ite, len(label_train), len(label_dev), len(label_test),
                   src_vocab_size, trg_vocab_size))

        """MODEL"""
        # BUGFIX: the original assigned the instance to the name `model`,
        # clobbering the imported `model` MODULE — the second CV fold then
        # crashed with AttributeError on `model.Multi`. Bind to `net` instead.
        if model_type == 'multi':
            net = model.Multi(
                src_vocab_size, trg_vocab_size, embed_size, hidden_size,
                class_size, dropout_ratio, coefficient, src_initialW,
                trg_initialW)
        elif model_type in ['label', 'pretrain']:
            net = model.Label(
                src_vocab_size, trg_vocab_size, embed_size, hidden_size,
                class_size, dropout_ratio, src_initialW, trg_initialW)
        else:
            net = model.EncoderDecoder(
                src_vocab_size, trg_vocab_size, embed_size, hidden_size,
                dropout_ratio, src_initialW, trg_initialW)

        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(net)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

        """GPU"""
        if gpu_id >= 0:
            chainer.cuda.get_device_from_id(gpu_id).use()
            net.to_gpu()

        """PRETRAIN"""
        if model_type == 'pretrain':
            logger.info('Pre-train start')
            logger.info('train size: {}, valid size: {}'.format(
                len(label_train), len(label_dev)))
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = net.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()
                    except Exception as e:
                        # Best-effort: skip the failing batch but keep the
                        # epoch running (e.g. OOM on an oversized batch).
                        logger.info('V{} ## P{} ## train iter: {}, {}'.format(
                            ite, epoch, i, e))
                chainer.serializers.save_npz(
                    model_valid_dir + 'p_model_epoch_{}.npz'.format(epoch),
                    net)

                """EVALUATE"""
                valid_loss = 0
                for batch in dev_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        valid_loss += net.pretrain(*batch).data
                logger.info('V{} ## P{} ## train loss: {}, val loss:{}'.format(
                    ite, epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss

            """MODEL SAVE"""
            # NOTE(review): the best pre-train checkpoint is archived but
            # never loaded back — main training continues from the LAST
            # pre-train epoch's weights. Confirm this is intended.
            best_epoch = min(pretrain_loss_dic,
                             key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(
                best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(
                model_valid_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
                model_valid_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')

        """TRAIN"""
        accuracy_dic = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()
                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(
                        ite, epoch, i, e))
            chainer.serializers.save_npz(
                model_valid_dir + 'model_epoch_{}.npz'.format(epoch), net)

            """DEV"""
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = net.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    # NOTE(review): if predict raises, output/label/align
                    # keep their values from the PREVIOUS batch and are
                    # appended again below — confirm this is acceptable.
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))

            # Grid-search the decision threshold parameters on the dev fold.
            if model_type == 'encdec':
                best_param_dic = evaluater.param_search(
                    alignments, [], c_label_dev)
            else:
                best_param_dic = evaluater.param_search(
                    labels, alignments, c_label_dev)
            param = max(best_param_dic, key=lambda x: best_param_dic[x])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param], 3)

            """TEST"""
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = net.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))

            # Evaluate the held-out fold with the dev-selected parameters.
            if model_type in ['multi', 'label', 'pretrain']:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(
                    labels, alignments, c_label_test, c_index_test, init, mix)
            else:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(
                    alignments, [], c_label_test, c_index_test, init, mix)
            test_score = round(s_rate[-1], 3)
            logger.info('V{} ## E{} ## loss:{}, dev: {}, test: {}'.format(
                ite, epoch, train_loss, dev_score, test_score))
            dataset.save_output(model_valid_dir, epoch, labels, alignments,
                                outputs, s_result)
            accuracy_dic[epoch] = [
                epoch, dev_score, test_score, param, s_rate, s_result
            ]

        """MODEL SAVE"""
        # Pick the epoch with the best dev score for this fold.
        best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][1]))
        cross_valid_result.append(accuracy_dic[best_epoch])
        logger.info('V{} ## best_epoch:{}, dev:{}, test:{}'.format(
            ite, best_epoch, accuracy_dic[best_epoch][1],
            accuracy_dic[best_epoch][2]))
        shutil.copyfile(
            model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch),
            model_valid_dir + 'best_model.npz')
        logger.info('')

    # Aggregate scores over all folds.
    average_dev_score = 0
    average_test_score = [0 for _ in range(len(cross_valid_result[0][4]))]
    s_result_total = []
    for i, r in enumerate(cross_valid_result, start=1):
        epoch = r[0]
        dev_score = r[1]
        param = r[3]
        test_score_list = [round(rr, 3) for rr in r[4]]
        s_result = r[5]
        average_dev_score += dev_score
        average_test_score = [
            average_test_score[i] + test_score_list[i]
            for i in range(len(average_test_score))
        ]
        logger.info(' {}: epoch{}, {}\t{}'.format(
            i, epoch, param,
            ' '.join(dataset.float_to_str(test_score_list))))
        s_result_total.extend(s_result)

    average_dev_score = round(average_dev_score / len(cross_valid_result), 3)
    average_test_score = [
        round(average_test_score[i] / len(cross_valid_result), 3)
        for i in range(len(average_test_score))
    ]
    logger.info('dev: {}, test: {}'.format(
        average_dev_score,
        ' '.join(dataset.float_to_str(average_test_score))))
    # Write per-sentence results sorted by original sentence index.
    with open(model_dir + 's_res.txt', 'w') as f:
        for l in sorted(s_result_total, key=lambda x: x[0]):
            f.write('{}\n'.format(l[1]))
def main():
    """Training entry point (single split, optional warm start).

    Builds vocabularies from the configured train files, trains a model
    (Multi / Label / EncoderDecoder), optionally pre-trains the classifier
    (unless resuming via ``--load_model``), evaluates every epoch on the
    "single" test set via grid search, and archives the best checkpoint.

    Side effects: creates the model directory (and a sub-directory when
    resuming), copies the config file, writes per-epoch ``.npz``
    checkpoints, and logs to ``log.txt``.
    """
    args = parse_args()
    config = configparser.ConfigParser()

    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model

    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])

    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name

    if model_type == 'multi':
        base_dir = './{}_{}{}_{}_c{}/'.format(
            model_type, vocab_name, vocab_size, data_path[0], coefficient)
    else:
        base_dir = './{}_{}{}_{}/'.format(
            model_type, vocab_name, vocab_size, data_path[0])
    model_save_dir = base_dir
    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
    # Archive the config next to the checkpoints and re-read the copy.
    shutil.copyfile(config_file, base_dir + config_file)
    config_file = base_dir + config_file
    config.read(config_file)
    if load_model is not None:
        # When resuming, checkpoints go into a sub-directory named after
        # the loaded model file (".npz" stripped).
        model_save_dir = base_dir + load_model.replace('.npz', '') + '/'
        if not os.path.exists(model_save_dir):
            os.mkdir(model_save_dir)

    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])

    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)
    logger.info(args)  # record CLI arguments
    logger.info('[Training start] logging to {}'.format(log_file))

    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    train_data_size = dataset.data_size(train_src_file)
    valid_data_size = dataset.data_size(valid_src_file)
    logger.info('train size: {}, valid size: {}'.format(
        train_data_size, valid_data_size))

    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, vocab_type, train_src_file, train_trg_file, vocab_size,
        gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)

    src_initialW = None
    trg_initialW = None
    if pretrain_w2v:
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size, trg_match_word_count,
                trg_vocab_size))
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))

    """ITERATOR"""
    _, src_label, src_text, _ = dataset.load_binary_score_file(train_src_file)
    trg_text = dataset.load(train_trg_file)
    train_iter = dataset.Iterator(
        src_text, src_label, trg_text, src_vocab, trg_vocab, batch_size,
        gpu_id, sort=True, shuffle=True)

    _, src_label, src_text, _ = dataset.load_binary_score_file(valid_src_file)
    trg_text = dataset.load(valid_trg_file)
    valid_iter = dataset.Iterator(
        src_text, src_label, trg_text, src_vocab, trg_vocab, batch_size,
        gpu_id, sort=False, shuffle=False)

    correct_label, correct_binary_label, correct_text, correct_index = \
        dataset.load_binary_score_file(test_src_file)
    trg_text = dataset.load(test_trg_file)
    test_iter = dataset.Iterator(
        correct_text, correct_binary_label, trg_text, src_vocab, trg_vocab,
        batch_size, gpu_id, sort=False, shuffle=False)

    """MODEL"""
    # BUGFIX(smell): bind the instance to `net`, not `model`, so the
    # imported `model` module is not shadowed.
    if model_type == 'multi':
        net = model.Multi(
            src_vocab_size, trg_vocab_size, embed_size, hidden_size,
            class_size, dropout_ratio, coefficient, src_initialW,
            trg_initialW)
    elif model_type in ['label', 'pretrain']:
        net = model.Label(
            src_vocab_size, trg_vocab_size, embed_size, hidden_size,
            class_size, dropout_ratio, src_initialW, trg_initialW)
    else:
        net = model.EncoderDecoder(
            src_vocab_size, trg_vocab_size, embed_size, hidden_size,
            dropout_ratio, src_initialW, trg_initialW)
    gridsearcher = gridsearch.GridSearch(valid_num)

    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(net)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))

    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        net.to_gpu()

    """PRETRAIN"""
    if model_type == 'pretrain' and load_model is None:
        logger.info('Pre-train start')
        pretrain_loss_dic = {}
        for epoch in range(1, pretrain_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = net.pretrain(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()
                except Exception as e:
                    # Best-effort: log and skip the failing batch.
                    logger.info('P{} ## train iter: {}, {}'.format(
                        epoch, i, e))
            chainer.serializers.save_npz(
                model_save_dir + 'p_model_epoch_{}.npz'.format(epoch), net)

            """EVALUATE"""
            valid_loss = 0
            for batch in valid_iter.generate():
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    valid_loss += net.pretrain(*batch).data
            logger.info('P{} ## train loss: {}, val loss:{}'.format(
                epoch, train_loss, valid_loss))
            pretrain_loss_dic[epoch] = valid_loss

        """MODEL SAVE & LOAD"""
        best_epoch = min(pretrain_loss_dic,
                         key=(lambda x: pretrain_loss_dic[x]))
        logger.info('best_epoch:{}, val loss: {}'.format(
            best_epoch, pretrain_loss_dic[best_epoch]))
        shutil.copyfile(
            model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
            model_save_dir + 'p_best_model.npz')
        logger.info('Pre-train finish')

    if load_model:
        # NOTE(review): this resolves relative to model_save_dir, which was
        # just redirected to the new sub-directory — verify the checkpoint
        # actually lives there rather than in base_dir.
        logger.info('load model: {}'.format(load_model))
        chainer.serializers.load_npz(model_save_dir + load_model, net)

    """TRAIN"""
    accuracy_dic = {}
    for epoch in range(1, n_epoch + 1):
        train_loss = 0
        for i, batch in enumerate(train_iter.generate(), start=1):
            try:
                loss = optimizer.target(*batch)
                train_loss += loss.data
                optimizer.target.cleargrads()
                loss.backward()
                optimizer.update()
            except Exception as e:
                logger.info('E{} ## train iter: {}, {}'.format(epoch, i, e))
        chainer.serializers.save_npz(
            model_save_dir + 'model_epoch_{}.npz'.format(epoch), net)

        """DEV & TEST"""
        outputs = []
        labels = []
        alignments = []
        for i, batch in enumerate(test_iter.generate(), start=1):
            try:
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    output, label, align = net.predict(batch[0], sos, eos)
            except Exception as e:
                # NOTE(review): on failure, output/label/align retain the
                # previous batch's values and are appended again below.
                logger.info('E{} ## test iter: {}, {}'.format(epoch, i, e))
            if model_type == 'multi':
                for o, l, a in zip(output, label, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    labels.append(chainer.cuda.to_cpu(l))
                    alignments.append(chainer.cuda.to_cpu(a))
            elif model_type in ['label', 'pretrain']:
                for l in label:
                    labels.append(chainer.cuda.to_cpu(l))
            else:
                for o, a in zip(output, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    alignments.append(chainer.cuda.to_cpu(a))

        if model_type in ['multi', 'label', 'pretrain']:
            dev_score, test_score, param_list, test_score_list, \
                s_result_list = gridsearcher.gridsearch(
                    correct_label, correct_index, labels, alignments)
        else:
            dev_score, test_score, param_list, test_score_list, \
                s_result_list = gridsearcher.gridsearch(
                    correct_label, correct_index, alignments, [])
        accuracy_dic[epoch] = [dev_score, test_score]

        # Log epoch summary and per-parameter score breakdown.
        logger.info('E{} ## loss:{}, dev: {}, test: {}'.format(
            epoch, train_loss, dev_score, test_score))
        logger.info('E{} ## {}'.format(
            epoch, ' '.join(dataset.float_to_str(test_score_list[-1]))))
        for i, (l, p) in enumerate(
                zip(test_score_list[:-1], param_list), start=1):
            logger.info('E{} ## {}: {}\t{}'.format(
                epoch, i, p, ' '.join(dataset.float_to_str(l))))
        # Persist predictions for this epoch.
        dataset.save_output(model_save_dir, epoch, labels, alignments,
                            outputs, s_result_list)

    """MODEL SAVE"""
    best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][0]))
    # BUGFIX: the original formatted `model_dir` here, a name never defined
    # in this function, raising NameError at the end of every run.
    logger.info('best_epoch:{}, dev: {}, test: {}, {}'.format(
        best_epoch, accuracy_dic[best_epoch][0], accuracy_dic[best_epoch][1],
        model_save_dir))
    shutil.copyfile(model_save_dir + 'model_epoch_{}.npz'.format(best_epoch),
                    model_save_dir + 'best_model.npz')
def main():
    """Test entry point: load a trained checkpoint and score the test set.

    Infers model/vocab/data configuration from the checkpoint's directory
    name (``<model>_<vocab><size>_<s|l>...``), rebuilds the vocabularies,
    runs prediction over the test file, grid-searches the scores, and
    writes hypothesis/label/alignment files next to the checkpoint.
    """
    args = parse_args()
    model_file = args.model_file
    # Directory containing the checkpoint (everything up to the last '/').
    model_dir = re.search(r'(.*/)', model_file).group(1)

    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)

    """LOGGER"""
    # Log to both stderr and test_log.txt in the model directory.
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    log_file = model_dir + 'test_log.txt'
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info('[Test start] logging to {}'.format(log_file))

    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    coefficient = float(config['Parameter']['coefficient'])

    """TEST DETAIL"""
    gpu_id = args.gpu
    batch_size = args.batch
    # Parse settings back out of the directory name produced at train time,
    # e.g. "multi_normal30000_s_c0.5" -> ['multi', 'normal30000', 's', ...].
    data = model_dir.split('/')[-2].split('_')
    model_type = data[0]
    if 'normal' in data[1]:
        vocab_type = 'normal'
    else:
        vocab_type = 'subword'
    if data[2] == 's':
        data_path = 'server'
    else:
        data_path = 'local'

    """DATASET"""
    test_src_file = config[data_path]['test_src_file']
    row_score_file = config[data_path]['row_score_file']
    row_score = dataset.load_score_file(row_score_file)
    test_data_size = dataset.data_size(test_src_file)
    logger.info('test size: {}'.format(test_data_size))

    # Rebuild vocabularies saved during training; <s>/</s> ids are wrapped
    # for the configured device by convert.convert_list.
    if vocab_type == 'normal':
        src_vocab = dataset.VocabNormal()
        src_vocab.load(model_dir + 'src_vocab.normal.pkl')
        src_vocab.set_reverse_vocab()
        trg_vocab = dataset.VocabNormal()
        trg_vocab.load(model_dir + 'trg_vocab.normal.pkl')
        trg_vocab.set_reverse_vocab()
        sos = convert.convert_list(
            np.array([src_vocab.vocab['<s>']], dtype=np.int32), gpu_id)
        eos = convert.convert_list(
            np.array([src_vocab.vocab['</s>']], dtype=np.int32), gpu_id)
    elif vocab_type == 'subword':
        src_vocab = dataset.VocabSubword()
        src_vocab.load(model_dir + 'src_vocab.sub.model')
        trg_vocab = dataset.VocabSubword()
        trg_vocab.load(model_dir + 'trg_vocab.sub.model')
        sos = convert.convert_list(
            np.array([src_vocab.vocab.PieceToId('<s>')], dtype=np.int32),
            gpu_id)
        eos = convert.convert_list(
            np.array([src_vocab.vocab.PieceToId('</s>')], dtype=np.int32),
            gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))

    # NOTE(review): this Iterator call passes file paths and omits gpu_id,
    # unlike the train-time calls (src, label, trg, vocabs, batch, gpu_id) —
    # presumably a different dataset module version; verify the signature.
    test_iter = dataset.Iterator(test_src_file, test_src_file, src_vocab,
                                 trg_vocab, batch_size, sort=False,
                                 shuffle=False)
    gridsearcher = gridsearch.GridSearch(test_src_file)

    """MODEL"""
    if model_type == 'multi':
        model = model_reg.Multi(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                coefficient)
    elif model_type in ['label', 'pretrain']:
        model = model_reg.Label(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio)
    else:
        model = model_reg.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                         embed_size, hidden_size,
                                         dropout_ratio)
    chainer.serializers.load_npz(model_file, model)

    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()

    """TEST"""
    epoch = 'T'  # sentinel epoch tag used in log lines below
    outputs = []
    labels = []
    alignments = []
    for i, batch in enumerate(test_iter.generate(), start=1):
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            output, label, align = model.predict(batch[0], sos, eos)
        # NOTE(review): assumes predict returns iterable label/align for
        # every model type, including encdec — confirm.
        for o in output:
            outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
        for l in label:
            labels.append(chainer.cuda.to_cpu(l))
        for a in align:
            alignments.append(chainer.cuda.to_cpu(a))

    # NOTE(review): [:-3] strips only "npz", leaving a trailing '.' — so
    # output names become e.g. "model.label.T" but also "model..hypo.T";
    # looks like it should be [:-4], kept as-is to preserve existing paths.
    model_file = model_file[:-3]
    if model_type == 'multi':
        score = gridsearcher.gridsearch(labels, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + 'label.T', 'w') as f:
            [f.write('{}\n'.format(l)) for l in labels]
        with open(model_file + '.hypo.T', 'w') as f:
            [f.write(o + '\n') for o in outputs]
        with open(model_file + '.align.T', 'w') as f:
            [f.write('{}\n'.format(a)) for a in alignments]
    elif model_type in ['label', 'pretrain']:
        score = gridsearcher.gridsearch(labels, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + 'label.T', 'w') as f:
            [f.write('{}\n'.format(l)) for l in labels]
    else:
        # encdec has no classifier labels; grid-search against raw scores.
        score = gridsearcher.gridsearch(row_score, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + '.hypo.T', 'w') as f:
            [f.write(o + '\n') for o in outputs]
        with open(model_file + '.align.T', 'w') as f:
            [f.write('{}\n'.format(a)) for a in alignments]