# Example 1
def main():
    """Cross-validated training for the 'super' model variants.

    Reads hyper-parameters from the command-line config file, shuffles and
    slices the test corpus into ``valid_num`` folds, and for every fold:
    builds a fresh model (optionally pre-trained and/or initialised from
    word2vec embeddings), trains it for ``n_epoch`` epochs, keeps the epoch
    with the best dev score, and finally logs the scores averaged over all
    folds and dumps the aggregated per-sentence results.
    """
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])

    # Prefix the vocabulary tag with 'p' when pretrained w2v embeddings are
    # used so the output directory name records that choice.
    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name

    if model_type == 'multi':
        model_dir = './super_{}_{}{}_{}_c{}/'.format(model_type, vocab_name,
                                                     vocab_size, data_path[0],
                                                     coefficient)
    else:
        model_dir = './super_{}_{}{}_{}/'.format(model_type, vocab_name,
                                                 vocab_size, data_path[0])

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
        shutil.copyfile(config_file, model_dir + config_file)
    # From here on use (and re-read) the copy stored inside the model
    # directory, so re-runs pick up the archived configuration.
    config_file = model_dir + config_file
    config.read(config_file)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])
    """LOGGER"""
    log_file = model_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)

    logger.info(args)  # record the command-line arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    correct_label, src_label, src_text, correct_index = dataset.load_binary_score_file(
        test_src_file)
    trg_text = dataset.load(test_trg_file)
    # Shuffle once, then carve all parallel lists into valid_num equal slices
    # that the fold loop below rotates through.
    slice_size = len(correct_label) // valid_num
    correct_label, src_label, src_text, trg_text, correct_index = gridsearch.shuffle_list(
        correct_label, src_label, src_text, trg_text, correct_index)

    correct_label = gridsearch.slice_list(correct_label, slice_size)
    src_label = gridsearch.slice_list(src_label, slice_size)
    src_text = gridsearch.slice_list(src_text, slice_size)
    trg_text = gridsearch.slice_list(trg_text, slice_size)
    correct_index = gridsearch.slice_list(correct_index, slice_size)

    evaluater = evaluate.Evaluate()

    cross_valid_result = []
    for ite in range(1, valid_num + 1):
        model_valid_dir = model_dir + 'valid{}/'.format(ite)
        if not os.path.exists(model_valid_dir):
            os.mkdir(model_valid_dir)

        index = ite - 1
        c_label_train, c_label_dev, c_label_test = gridsearch.split_train_dev_test(
            correct_label, index)
        label_train, label_dev, label_test = gridsearch.split_train_dev_test(
            src_label, index)
        src_train, src_dev, src_test = gridsearch.split_train_dev_test(
            src_text, index)
        trg_train, trg_dev, trg_test = gridsearch.split_train_dev_test(
            trg_text, index)
        c_index_train, c_index_dev, c_index_test = gridsearch.split_train_dev_test(
            correct_index, index)
        """VOCABULARY"""
        src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
            model_valid_dir, vocab_type, src_train, trg_train, vocab_size,
            gpu_id)
        src_vocab_size = len(src_vocab.vocab)
        trg_vocab_size = len(trg_vocab.vocab)

        src_initialW = None
        trg_initialW = None

        if pretrain_w2v:
            w2v = word2vec.Word2Vec()
            src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
                src_vocab.vocab, src_w2v_file)
            trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
                trg_vocab.vocab, trg_w2v_file)
            logger.info(
                'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                    src_match_word_count, src_vocab_size, trg_match_word_count,
                    trg_vocab_size))
        """ITERATOR"""
        train_iter = dataset.Iterator(src_train,
                                      label_train,
                                      trg_train,
                                      src_vocab,
                                      trg_vocab,
                                      batch_size,
                                      gpu_id,
                                      sort=True,
                                      shuffle=True)
        dev_iter = dataset.Iterator(src_dev,
                                    label_dev,
                                    trg_dev,
                                    src_vocab,
                                    trg_vocab,
                                    batch_size,
                                    gpu_id,
                                    sort=False,
                                    shuffle=False)
        test_iter = dataset.Iterator(src_test,
                                     label_test,
                                     trg_test,
                                     src_vocab,
                                     trg_vocab,
                                     batch_size,
                                     gpu_id,
                                     sort=False,
                                     shuffle=False)

        logger.info(
            'V{} ## train:{}, dev:{}, test:{}, src_vocab:{}, trg_vocab:{}'.
            format(ite, len(label_train), len(label_dev), len(label_test),
                   src_vocab_size, trg_vocab_size))
        """MODEL"""
        # Bind the network to ``net`` rather than ``model``: assigning to
        # ``model`` here would shadow the imported ``model`` module, so
        # constructing the network would fail from the second fold onwards.
        if model_type == 'multi':
            net = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                              hidden_size, class_size, dropout_ratio,
                              coefficient, src_initialW, trg_initialW)
        elif model_type in ['label', 'pretrain']:
            net = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                              hidden_size, class_size, dropout_ratio,
                              src_initialW, trg_initialW)
        else:
            net = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                       embed_size, hidden_size,
                                       dropout_ratio, src_initialW,
                                       trg_initialW)
        """OPTIMIZER"""
        optimizer = chainer.optimizers.Adam()
        optimizer.setup(net)
        optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
        optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
        """GPU"""
        if gpu_id >= 0:
            chainer.cuda.get_device_from_id(gpu_id).use()
            net.to_gpu()
        """PRETRAIN"""
        if model_type == 'pretrain':
            logger.info('Pre-train start')
            logger.info('train size: {}, valid size: {}'.format(
                len(label_train), len(label_dev)))
            pretrain_loss_dic = {}
            for epoch in range(1, pretrain_epoch + 1):
                train_loss = 0
                for i, batch in enumerate(train_iter.generate(), start=1):
                    try:
                        loss = net.pretrain(*batch)
                        train_loss += loss.data
                        optimizer.target.cleargrads()
                        loss.backward()
                        optimizer.update()

                    except Exception as e:
                        # Log and skip the failing batch; pre-training is
                        # best-effort per batch.
                        logger.info('V{} ## P{} ## train iter: {}, {}'.format(
                            ite, epoch, i, e))

                chainer.serializers.save_npz(
                    model_valid_dir + 'p_model_epoch_{}.npz'.format(epoch),
                    net)
                """EVALUATE"""
                valid_loss = 0
                for batch in dev_iter.generate():
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        valid_loss += net.pretrain(*batch).data
                logger.info('V{} ## P{} ## train loss: {}, val loss:{}'.format(
                    ite, epoch, train_loss, valid_loss))
                pretrain_loss_dic[epoch] = valid_loss
            """MODEL SAVE"""
            # Keep the pre-train epoch with the lowest validation loss.
            best_epoch = min(pretrain_loss_dic,
                             key=(lambda x: pretrain_loss_dic[x]))
            logger.info('best_epoch:{}, val loss: {}'.format(
                best_epoch, pretrain_loss_dic[best_epoch]))
            shutil.copyfile(
                model_valid_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
                model_valid_dir + 'p_best_model.npz')
            logger.info('Pre-train finish')
        """TRAIN"""
        accuracy_dic = {}
        for epoch in range(1, n_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = optimizer.target(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()

                except Exception as e:
                    logger.info('V{} ## E{} ## train iter: {}, {}'.format(
                        ite, epoch, i, e))
            chainer.serializers.save_npz(
                model_valid_dir + 'model_epoch_{}.npz'.format(epoch), net)
            """DEV"""
            # Search the evaluation parameters on the dev slice.
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(dev_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = net.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## dev iter: {}, {}'.format(
                        ite, epoch, i, e))

                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))

            if model_type == 'encdec':
                best_param_dic = evaluater.param_search(
                    alignments, [], c_label_dev)
            else:
                best_param_dic = evaluater.param_search(
                    labels, alignments, c_label_dev)
            param = max(best_param_dic, key=lambda x: best_param_dic[x])
            init, mix = evaluate.key_to_param(param)
            dev_score = round(best_param_dic[param], 3)
            """TEST"""
            # Re-run prediction on the test slice and score it with the
            # parameters picked on dev.
            outputs = []
            labels = []
            alignments = []
            for i, batch in enumerate(test_iter.generate(), start=1):
                try:
                    with chainer.no_backprop_mode(), chainer.using_config(
                            'train', False):
                        output, label, align = net.predict(
                            batch[0], sos, eos)
                except Exception as e:
                    logger.info('V{} ## E{} ## test iter: {}, {}'.format(
                        ite, epoch, i, e))
                if model_type == 'multi':
                    for o, l, a in zip(output, label, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        labels.append(chainer.cuda.to_cpu(l))
                        alignments.append(chainer.cuda.to_cpu(a))
                elif model_type in ['label', 'pretrain']:
                    for l in label:
                        labels.append(chainer.cuda.to_cpu(l))
                else:
                    for o, a in zip(output, align):
                        outputs.append(
                            trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                        alignments.append(chainer.cuda.to_cpu(a))

            if model_type in ['multi', 'label', 'pretrain']:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(
                    labels, alignments, c_label_test, c_index_test, init, mix)
            else:
                s_rate, s_count, _, _, s_result = evaluater.eval_param(
                    alignments, [], c_label_test, c_index_test, init, mix)
            test_score = round(s_rate[-1], 3)
            logger.info('V{} ## E{} ## loss:{}, dev: {}, test: {}'.format(
                ite, epoch, train_loss, dev_score, test_score))

            dataset.save_output(model_valid_dir, epoch, labels, alignments,
                                outputs, s_result)
            accuracy_dic[epoch] = [
                epoch, dev_score, test_score, param, s_rate, s_result
            ]
        """MODEL SAVE"""
        # Keep the epoch with the highest dev score ([1]) for this fold.
        best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][1]))
        cross_valid_result.append(accuracy_dic[best_epoch])
        logger.info('V{} ## best_epoch:{}, dev:{}, test:{}'.format(
            ite, best_epoch, accuracy_dic[best_epoch][1],
            accuracy_dic[best_epoch][2]))
        shutil.copyfile(
            model_valid_dir + 'model_epoch_{}.npz'.format(best_epoch),
            model_valid_dir + 'best_model.npz')

        logger.info('')

    # Average dev/test scores over folds and log the per-fold details.
    average_dev_score = 0
    average_test_score = [0 for _ in range(len(cross_valid_result[0][4]))]
    s_result_total = []
    for i, r in enumerate(cross_valid_result, start=1):
        epoch = r[0]
        dev_score = r[1]
        param = r[3]
        test_score_list = [round(rr, 3) for rr in r[4]]
        s_result = r[5]

        average_dev_score += dev_score
        average_test_score = [
            average_test_score[i] + test_score_list[i]
            for i in range(len(average_test_score))
        ]
        logger.info('   {}: epoch{}, {}\t{}'.format(
            i, epoch, param, ' '.join(dataset.float_to_str(test_score_list))))
        s_result_total.extend(s_result)
    average_dev_score = round(average_dev_score / len(cross_valid_result), 3)
    average_test_score = [
        round(average_test_score[i] / len(cross_valid_result), 3)
        for i in range(len(average_test_score))
    ]
    logger.info('dev: {}, test: {}'.format(
        average_dev_score, ' '.join(dataset.float_to_str(average_test_score))))

    # Dump all per-sentence results ordered by their original index.
    with open(model_dir + 's_res.txt', 'w') as f:
        [
            f.write('{}\n'.format(l[1]))
            for l in sorted(s_result_total, key=lambda x: x[0])
        ]
# Example 2
def main():
    """Single-run training entry point (no cross-validation folds).

    Builds vocabularies and iterators from the configured train/valid/test
    files, constructs the requested model variant, optionally pre-trains it
    (or loads a previously saved .npz snapshot), trains it for ``n_epoch``
    epochs while grid-searching evaluation parameters on the test set, and
    keeps the epoch with the best dev score.
    """
    args = parse_args()
    config = configparser.ConfigParser()
    """ARGS DETAIL"""
    config_file = args.config_file
    batch_size = args.batch
    n_epoch = args.epoch
    pretrain_epoch = args.pretrain_epoch
    gpu_id = args.gpu
    model_type = args.model
    vocab_type = args.vocab
    pretrain_w2v = args.pretrain_w2v
    data_path = args.data_path
    load_model = args.load_model
    """DIR PREPARE"""
    config.read(config_file)
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])

    # Prefix the vocabulary tag with 'p' when pretrained w2v embeddings are
    # used so the output directory name records that choice.
    vocab_name = vocab_type
    if pretrain_w2v:
        vocab_name = 'p' + vocab_name

    if model_type == 'multi':
        base_dir = './{}_{}{}_{}_c{}/'.format(model_type, vocab_name,
                                              vocab_size, data_path[0],
                                              coefficient)
    else:
        base_dir = './{}_{}{}_{}/'.format(model_type, vocab_name, vocab_size,
                                          data_path[0])
    model_save_dir = base_dir

    if not os.path.exists(base_dir):
        os.mkdir(base_dir)
        shutil.copyfile(config_file, base_dir + config_file)
    # From here on use (and re-read) the copy stored inside the base
    # directory, so re-runs pick up the archived configuration.
    config_file = base_dir + config_file
    config.read(config_file)

    # When resuming from a snapshot, write new artifacts into a
    # sub-directory named after the loaded model file.
    if load_model is not None:
        model_save_dir = base_dir + load_model.replace('.npz', '') + '/'
        if not os.path.exists(model_save_dir):
            os.mkdir(model_save_dir)
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    weight_decay = float(config['Parameter']['weight_decay'])
    gradclip = float(config['Parameter']['gradclip'])
    vocab_size = int(config['Parameter']['vocab_size'])
    coefficient = float(config['Parameter']['coefficient'])
    valid_num = int(config['Parameter']['valid_num'])
    """LOGGER"""
    log_file = model_save_dir + 'log.txt'
    logger = dataset.prepare_logger(log_file)

    logger.info(args)  # record the command-line arguments
    logger.info('[Training start] logging to {}'.format(log_file))
    """DATASET"""
    train_src_file = config[data_path]['train_src_file']
    train_trg_file = config[data_path]['train_trg_file']
    valid_src_file = config[data_path]['valid_src_file']
    valid_trg_file = config[data_path]['valid_trg_file']
    test_src_file = config[data_path]['single_src_file']
    test_trg_file = config[data_path]['single_trg_file']
    src_w2v_file = config[data_path]['src_w2v_file']
    trg_w2v_file = config[data_path]['trg_w2v_file']

    train_data_size = dataset.data_size(train_src_file)
    valid_data_size = dataset.data_size(valid_src_file)
    logger.info('train size: {}, valid size: {}'.format(
        train_data_size, valid_data_size))
    """VOCABULARY"""
    src_vocab, trg_vocab, sos, eos = dataset.prepare_vocab(
        base_dir, vocab_type, train_src_file, train_trg_file, vocab_size,
        gpu_id)
    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)

    src_initialW = None
    trg_initialW = None

    if pretrain_w2v:
        w2v = word2vec.Word2Vec()
        src_initialW, vector_size, src_match_word_count = w2v.make_initialW(
            src_vocab.vocab, src_w2v_file)
        trg_initialW, vector_size, trg_match_word_count = w2v.make_initialW(
            trg_vocab.vocab, trg_w2v_file)
        logger.info(
            'Initialize w2v embedding. Match: src {}/{}, trg {}/{}'.format(
                src_match_word_count, src_vocab_size, trg_match_word_count,
                trg_vocab_size))

    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))
    """ITERATOR"""
    _, src_label, src_text, _ = dataset.load_binary_score_file(train_src_file)
    trg_text = dataset.load(train_trg_file)
    train_iter = dataset.Iterator(src_text,
                                  src_label,
                                  trg_text,
                                  src_vocab,
                                  trg_vocab,
                                  batch_size,
                                  gpu_id,
                                  sort=True,
                                  shuffle=True)

    _, src_label, src_text, _ = dataset.load_binary_score_file(valid_src_file)
    trg_text = dataset.load(valid_trg_file)
    valid_iter = dataset.Iterator(src_text,
                                  src_label,
                                  trg_text,
                                  src_vocab,
                                  trg_vocab,
                                  batch_size,
                                  gpu_id,
                                  sort=False,
                                  shuffle=False)

    correct_label, correct_binary_label, correct_text, correct_index = dataset.load_binary_score_file(
        test_src_file)
    trg_text = dataset.load(test_trg_file)
    test_iter = dataset.Iterator(correct_text,
                                 correct_binary_label,
                                 trg_text,
                                 src_vocab,
                                 trg_vocab,
                                 batch_size,
                                 gpu_id,
                                 sort=False,
                                 shuffle=False)
    """MODEL"""
    # Bind the network to ``net`` rather than ``model``: assigning to
    # ``model`` would shadow the imported ``model`` module.
    if model_type == 'multi':
        net = model.Multi(src_vocab_size, trg_vocab_size, embed_size,
                          hidden_size, class_size, dropout_ratio,
                          coefficient, src_initialW, trg_initialW)
    elif model_type in ['label', 'pretrain']:
        net = model.Label(src_vocab_size, trg_vocab_size, embed_size,
                          hidden_size, class_size, dropout_ratio,
                          src_initialW, trg_initialW)
    else:
        net = model.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                   embed_size, hidden_size, dropout_ratio,
                                   src_initialW, trg_initialW)

    gridsearcher = gridsearch.GridSearch(valid_num)
    """OPTIMIZER"""
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(net)
    optimizer.add_hook(chainer.optimizer.GradientClipping(gradclip))
    optimizer.add_hook(chainer.optimizer.WeightDecay(weight_decay))
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        net.to_gpu()
    """PRETRAIN"""
    # Pre-training is skipped when resuming from a snapshot.
    if model_type == 'pretrain' and load_model is None:
        logger.info('Pre-train start')
        pretrain_loss_dic = {}
        for epoch in range(1, pretrain_epoch + 1):
            train_loss = 0
            for i, batch in enumerate(train_iter.generate(), start=1):
                try:
                    loss = net.pretrain(*batch)
                    train_loss += loss.data
                    optimizer.target.cleargrads()
                    loss.backward()
                    optimizer.update()

                except Exception as e:
                    # Log and skip the failing batch; training is
                    # best-effort per batch.
                    logger.info('P{} ## train iter: {}, {}'.format(
                        epoch, i, e))
            chainer.serializers.save_npz(
                model_save_dir + 'p_model_epoch_{}.npz'.format(epoch), net)
            """EVALUATE"""
            valid_loss = 0
            for batch in valid_iter.generate():
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    valid_loss += net.pretrain(*batch).data
            logger.info('P{} ## train loss: {}, val loss:{}'.format(
                epoch, train_loss, valid_loss))
            pretrain_loss_dic[epoch] = valid_loss
        """MODEL SAVE & LOAD"""
        # Keep the pre-train epoch with the lowest validation loss.
        best_epoch = min(pretrain_loss_dic,
                         key=(lambda x: pretrain_loss_dic[x]))
        logger.info('best_epoch:{}, val loss: {}'.format(
            best_epoch, pretrain_loss_dic[best_epoch]))
        shutil.copyfile(
            model_save_dir + 'p_model_epoch_{}.npz'.format(best_epoch),
            model_save_dir + 'p_best_model.npz')
        logger.info('Pre-train finish')

    if load_model:
        logger.info('load model: {}'.format(load_model))
        chainer.serializers.load_npz(model_save_dir + load_model, net)
    """TRAIN"""
    accuracy_dic = {}
    for epoch in range(1, n_epoch + 1):
        train_loss = 0
        for i, batch in enumerate(train_iter.generate(), start=1):
            try:
                loss = optimizer.target(*batch)
                train_loss += loss.data
                optimizer.target.cleargrads()
                loss.backward()
                optimizer.update()

            except Exception as e:
                logger.info('E{} ## train iter: {}, {}'.format(epoch, i, e))
        chainer.serializers.save_npz(
            model_save_dir + 'model_epoch_{}.npz'.format(epoch), net)
        """DEV & TEST"""
        outputs = []
        labels = []
        alignments = []
        for i, batch in enumerate(test_iter.generate(), start=1):
            try:
                with chainer.no_backprop_mode(), chainer.using_config(
                        'train', False):
                    output, label, align = net.predict(batch[0], sos, eos)
            except Exception as e:
                logger.info('E{} ## test iter: {}, {}'.format(epoch, i, e))

            if model_type == 'multi':
                for o, l, a in zip(output, label, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    labels.append(chainer.cuda.to_cpu(l))
                    alignments.append(chainer.cuda.to_cpu(a))
            elif model_type in ['label', 'pretrain']:
                for l in label:
                    labels.append(chainer.cuda.to_cpu(l))
            else:
                for o, a in zip(output, align):
                    outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
                    alignments.append(chainer.cuda.to_cpu(a))

        if model_type in ['multi', 'label', 'pretrain']:
            dev_score, test_score, param_list, test_score_list, s_result_list = gridsearcher.gridsearch(
                correct_label, correct_index, labels, alignments)
        else:
            dev_score, test_score, param_list, test_score_list, s_result_list = gridsearcher.gridsearch(
                correct_label, correct_index, alignments, [])

        accuracy_dic[epoch] = [dev_score, test_score]

        # Log the epoch's scores and the per-parameter breakdown.
        logger.info('E{} ## loss:{}, dev: {}, test: {}'.format(
            epoch, train_loss, dev_score, test_score))
        logger.info('E{} ## {}'.format(
            epoch, ' '.join(dataset.float_to_str(test_score_list[-1]))))
        for i, (l, p) in enumerate(zip(test_score_list[:-1], param_list),
                                   start=1):
            logger.info('E{} ##   {}: {}\t{}'.format(
                epoch, i, p, ' '.join(dataset.float_to_str(l))))

        # Persist this epoch's outputs.
        dataset.save_output(model_save_dir, epoch, labels, alignments, outputs,
                            s_result_list)
    """MODEL SAVE"""
    # Keep the epoch with the highest dev score ([0]).  Log the actual save
    # directory (the original referenced an undefined ``model_dir`` here,
    # which raised a NameError at the very end of training).
    best_epoch = max(accuracy_dic, key=(lambda x: accuracy_dic[x][0]))
    logger.info('best_epoch:{}, dev: {}, test: {}, {}'.format(
        best_epoch, accuracy_dic[best_epoch][0], accuracy_dic[best_epoch][1],
        model_save_dir))
    shutil.copyfile(model_save_dir + 'model_epoch_{}.npz'.format(best_epoch),
                    model_save_dir + 'best_model.npz')
# Example 3
def main():
    """Load a trained model snapshot and score it on the test set.

    The model file path given on the command line determines the model
    directory; the (single) .ini file found there supplies the
    hyper-parameters, and the directory name itself encodes the model type,
    vocabulary type and data section.  Predictions, labels and alignments
    are written next to the model file and scored via grid search.
    """
    args = parse_args()
    model_file = args.model_file
    # Everything up to the last '/' is the model directory.
    model_dir = re.search(r'(.*/)', model_file).group(1)
    """LOAD CONFIG FILE"""
    config_files = glob.glob(os.path.join(model_dir, '*.ini'))
    assert len(config_files) == 1, 'Put only one config file in the directory'
    config_file = config_files[0]
    config = configparser.ConfigParser()
    config.read(config_file)
    """LOGGER"""
    # Log to both stdout and test_log.txt in the model directory.
    logger = getLogger(__name__)
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')

    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(formatter)
    logger.addHandler(sh)

    log_file = model_dir + 'test_log.txt'
    fh = logging.FileHandler(log_file)
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info('[Test start] logging to {}'.format(log_file))
    """PARAMATER"""
    embed_size = int(config['Parameter']['embed_size'])
    hidden_size = int(config['Parameter']['hidden_size'])
    class_size = int(config['Parameter']['class_size'])
    dropout_ratio = float(config['Parameter']['dropout'])
    coefficient = float(config['Parameter']['coefficient'])
    """TEST DETAIL"""
    gpu_id = args.gpu
    batch_size = args.batch
    # The directory name is '<type>_<vocab><size>_<data...>'; recover the
    # training-time settings from its underscore-separated parts.
    data = model_dir.split('/')[-2].split('_')

    model_type = data[0]
    if 'normal' in data[1]:
        vocab_type = 'normal'
    else:
        vocab_type = 'subword'
    # 's' marks the 'server' data section; anything else means 'local'.
    if data[2] == 's':
        data_path = 'server'
    else:
        data_path = 'local'
    """DATASET"""
    test_src_file = config[data_path]['test_src_file']
    row_score_file = config[data_path]['row_score_file']
    row_score = dataset.load_score_file(row_score_file)

    test_data_size = dataset.data_size(test_src_file)
    logger.info('test size: {}'.format(test_data_size))
    if vocab_type == 'normal':
        # Word-level vocabulary saved as a pickle during training.
        src_vocab = dataset.VocabNormal()
        src_vocab.load(model_dir + 'src_vocab.normal.pkl')
        src_vocab.set_reverse_vocab()
        trg_vocab = dataset.VocabNormal()
        trg_vocab.load(model_dir + 'trg_vocab.normal.pkl')
        trg_vocab.set_reverse_vocab()

        sos = convert.convert_list(
            np.array([src_vocab.vocab['<s>']], dtype=np.int32), gpu_id)
        eos = convert.convert_list(
            np.array([src_vocab.vocab['</s>']], dtype=np.int32), gpu_id)

    elif vocab_type == 'subword':
        # SentencePiece-style subword vocabulary saved during training.
        src_vocab = dataset.VocabSubword()
        src_vocab.load(model_dir + 'src_vocab.sub.model')
        trg_vocab = dataset.VocabSubword()
        trg_vocab.load(model_dir + 'trg_vocab.sub.model')

        sos = convert.convert_list(
            np.array([src_vocab.vocab.PieceToId('<s>')], dtype=np.int32),
            gpu_id)
        eos = convert.convert_list(
            np.array([src_vocab.vocab.PieceToId('</s>')], dtype=np.int32),
            gpu_id)

    src_vocab_size = len(src_vocab.vocab)
    trg_vocab_size = len(trg_vocab.vocab)
    logger.info('src_vocab size: {}, trg_vocab size: {}'.format(
        src_vocab_size, trg_vocab_size))

    # NOTE(review): this call passes test_src_file twice and omits the
    # gpu_id positional argument used by the training scripts' Iterator
    # calls -- confirm dataset.Iterator's signature in this pipeline.
    test_iter = dataset.Iterator(test_src_file,
                                 test_src_file,
                                 src_vocab,
                                 trg_vocab,
                                 batch_size,
                                 sort=False,
                                 shuffle=False)

    gridsearcher = gridsearch.GridSearch(test_src_file)
    """MODEL"""
    if model_type == 'multi':
        model = model_reg.Multi(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio,
                                coefficient)
    elif model_type in ['label', 'pretrain']:
        model = model_reg.Label(src_vocab_size, trg_vocab_size, embed_size,
                                hidden_size, class_size, dropout_ratio)
    else:
        model = model_reg.EncoderDecoder(src_vocab_size, trg_vocab_size,
                                         embed_size, hidden_size,
                                         dropout_ratio)

    chainer.serializers.load_npz(model_file, model)
    """GPU"""
    if gpu_id >= 0:
        logger.info('Use GPU')
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    """TEST"""
    epoch = 'T'  # pseudo-epoch tag used in log lines and output file names
    outputs = []
    labels = []
    alignments = []
    for i, batch in enumerate(test_iter.generate(), start=1):
        with chainer.no_backprop_mode(), chainer.using_config('train', False):
            output, label, align = model.predict(batch[0], sos, eos)
        for o in output:
            outputs.append(trg_vocab.id2word(chainer.cuda.to_cpu(o)))
        for l in label:
            labels.append(chainer.cuda.to_cpu(l))
        for a in align:
            alignments.append(chainer.cuda.to_cpu(a))

    # Strip the 'npz' suffix; the trailing '.' is kept, so output names
    # become e.g. 'best_model..hypo.T'.
    # NOTE(review): 'label.T' below has no leading dot, unlike '.hypo.T' /
    # '.align.T' -- presumably intentional to match training output names;
    # verify against dataset.save_output.
    model_file = model_file[:-3]
    if model_type == 'multi':
        score = gridsearcher.gridsearch(labels, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + 'label.T', 'w') as f:
            [f.write('{}\n'.format(l)) for l in labels]
        with open(model_file + '.hypo.T', 'w') as f:
            [f.write(o + '\n') for o in outputs]
        with open(model_file + '.align.T', 'w') as f:
            [f.write('{}\n'.format(a)) for a in alignments]

    elif model_type in ['label', 'pretrain']:
        score = gridsearcher.gridsearch(labels, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + 'label.T', 'w') as f:
            [f.write('{}\n'.format(l)) for l in labels]

    else:
        # Encoder-decoder: score alignments against the raw score file.
        score = gridsearcher.gridsearch(row_score, alignments)
        logger.info('E{} ## {}'.format(epoch, score[0]))
        logger.info('E{} ## {}'.format(epoch, score[1]))
        with open(model_file + '.hypo.T', 'w') as f:
            [f.write(o + '\n') for o in outputs]
        with open(model_file + '.align.T', 'w') as f:
            [f.write('{}\n'.format(a)) for a in alignments]