Exemple #1
0
def train(args):
    """
    Train the reading comprehension model.

    Loads the pickled vocabulary from ``args.vocab_dir``, builds a
    ``BRCDataset`` from the train/dev files, converts its text to ids and
    then trains an ``RCModel`` on it.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir``,
            ``algo`` and ``dropout_keep_prob``.
    """
    log = logging.getLogger("brc")

    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    # NOTE: assigning pretrained embeddings (embed_dim from args.embed_size,
    # load_pretrained_embeddings(args.embedding_path)) is disabled here.
    log.info('Vocabulary %s' % vocabulary.size())

    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         vocabulary, args.train_files, args.dev_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    # NOTE: MTRCModel(vocab, args) was left as a disabled alternative.
    log.info('Initialize the model...')
    model = RCModel(vocabulary, args)

    log.info('Training the model...')
    model.train(dataset, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('Done with model training!')
Exemple #2
0
def evaluate(args):
    """
    Evaluate a trained model on the dev files.

    Restores the model saved under ``args.model_dir`` with prefix
    ``args.algo``, runs it over the dev batches and logs the loss and the
    metrics returned by ``RCModel.evaluate``.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``dev_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo``, ``batch_size`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.dev_files`` is empty.
    """
    log = logging.getLogger("brc")
    log.info('加wudi...')
    log.info('加载数据集和词汇表...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.dev_files) > 0, '找不到验证文件.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         dev_files=args.dev_files)

    log.info('把文本转化为id序列...')
    dataset.convert_to_ids(vocabulary)

    log.info('重载模型...')
    model = RCModel(vocabulary, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('验证模型...')
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('dev', args.batch_size,
                                       pad_id=padding_id, shuffle=False)
    loss, metrics = model.evaluate(batches,
                                   result_dir=args.result_dir,
                                   result_prefix='dev.predicted')

    log.info('验证集上的损失为: {}'.format(loss))
    log.info('验证集的结果: {}'.format(metrics))
    log.info('预测的答案证保存到 {}'.format(os.path.join(args.result_dir)))
Exemple #3
0
def predict(args):
    """
    Predict answers for the test files.

    Restores the model saved under ``args.model_dir`` with prefix
    ``args.algo`` and runs ``RCModel.evaluate`` over the test batches with
    the result prefix ``'test.predicted'``.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``test_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo``, ``batch_size`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.test_files`` is empty.
    """
    log = logging.getLogger("brc")
    log.info('加载数据集和词汇表...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.test_files) > 0, '找不到测试文件.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         test_files=args.test_files)

    log.info('把文本转化为id序列...')
    dataset.convert_to_ids(vocabulary)

    log.info('重载模型...')
    model = RCModel(vocabulary, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('预测测试集的答案...')
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('test', args.batch_size,
                                       pad_id=padding_id, shuffle=False)
    model.evaluate(batches,
                   result_dir=args.result_dir,
                   result_prefix='test.predicted')
Exemple #4
0
def train(args):
    """
    Train the model, deriving the step counts from the training-set size.

    The total number of training steps is
    ``len(train_set) / batch_size * epochs`` (truncated to int) and the
    warm-up step count is that total scaled by ``args.warmup_proportion``.
    Both are passed to the ``RCModel`` constructor.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``save_dir``,
            ``train_files``, ``dev_files``, ``batch_size``, ``epochs``,
            ``warmup_proportion``, ``model_dir``, ``algo`` and ``dropout``.
    """
    log = logging.getLogger("QAPointNet")
    log.info("====== training ======")

    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                      args.save_dir, args.train_files, args.dev_files)

    # Step counts are computed before the text is converted to ids.
    total_steps = int(len(data.train_set) / args.batch_size * args.epochs)
    warmup_steps = int(total_steps * args.warmup_proportion)

    log.info('Converting text into ids...')
    data.convert_to_ids(vocabulary)

    log.info('Initialize the model...')
    network = RCModel(vocabulary, total_steps, warmup_steps, args)
    # Drop the local reference to the vocabulary once the model is built.
    del vocabulary

    log.info('Training the model...')
    network.train(data, args.epochs, args.batch_size,
                  save_dir=args.model_dir,
                  save_prefix=args.algo,
                  dropout=args.dropout)

    log.info('====== Done with model training! ======')
Exemple #5
0
def predict(args):
    """
    Predict answers for the test files.

    The vocabulary is unpickled directly from ``args.vocab_path``; the
    model is restored from ``args.model_dir`` with prefix ``args.algo``.

    Args:
        args: parsed options. This function reads ``vocab_path``,
            ``test_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo``, ``batch_size`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.test_files`` is empty.
    """
    log = logging.getLogger("brc")
    log.info('Load data_set and vocab...')
    with open(args.vocab_path, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.test_files) > 0, 'No test files are provided.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         test_files=args.test_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    log.info('Restoring the model...')
    model = RCModel(vocabulary, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('Predicting answers for test set...')
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('test', args.batch_size,
                                       pad_id=padding_id, shuffle=False)
    model.evaluate(batches,
                   result_dir=args.result_dir,
                   result_prefix='test.predicted')
Exemple #6
0
def evaluate(args):
    """
    Evaluate a restored model on the dev files and log loss and metrics.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``dev_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``save_dir``, ``batch_size``, ``epochs``,
            ``warmup_proportion``, ``model_dir`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.dev_files`` is empty.
    """
    log = logging.getLogger("QAPointNet")
    log.info("====== evaluating ======")
    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.dev_files) > 0, 'No dev files are provided.'
    data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                      args.save_dir, dev_files=args.dev_files)

    # The model constructor takes step counts even for evaluation; they are
    # derived from the size of the dataset's train_set.
    total_steps = int(len(data.train_set) / args.batch_size * args.epochs)
    warmup_steps = int(total_steps * args.warmup_proportion)

    log.info('Converting text into ids...')
    data.convert_to_ids(vocabulary)

    log.info('Restoring the model...')
    network = RCModel(vocabulary, total_steps, warmup_steps, args)
    # NOTE(review): the checkpoint prefix is hard-coded rather than taken
    # from args.
    network.restore(args.model_dir, 'BIDAF_42000')

    log.info('Evaluating the model on dev set...')
    # NOTE(review): batch size is fixed at 64 here, not args.batch_size.
    padding_id = vocabulary.get_word_id(vocabulary.pad_token)
    batches = data.gen_mini_batches('dev', 64, padding_id, shuffle=False)

    loss, metrics = network.evaluate(batches,
                                     result_dir=args.result_dir,
                                     result_prefix='dev.predicted')

    log.info('Loss on dev set: {}'.format(loss))
    log.info('Result on dev set: {}'.format(metrics))
    log.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))
Exemple #7
0
def evaluate(args):
    """
    Evaluate the trained model on the dev files.

    The vocabulary is unpickled directly from ``args.vocab_path``. The loss
    and metrics returned by ``RCModel.evaluate`` are logged.

    Args:
        args: parsed options. This function reads ``vocab_path``,
            ``dev_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo``, ``batch_size`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.dev_files`` is empty.
    """
    log = logging.getLogger("brc")
    log.info('Load data_set and vocab...')
    with open(args.vocab_path, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.dev_files) > 0, 'No dev files are provided.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         dev_files=args.dev_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    log.info('Restoring the model...')
    model = RCModel(vocabulary, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('Evaluating the model on dev set...')
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('dev', args.batch_size,
                                       pad_id=padding_id, shuffle=False)
    loss, metrics = model.evaluate(batches,
                                   result_dir=args.result_dir,
                                   result_prefix='dev.predicted')

    log.info('Loss on dev set: {}'.format(loss))
    log.info('Result on dev set: {}'.format(metrics))
    saved_to = os.path.join(args.result_dir)
    log.info('Predicted answers are saved to {}'.format(saved_to))
Exemple #8
0
def train(args):
    """
    Train the reading comprehension model.

    Attaches a file handler (writing to ``args.log_path``) and a console
    handler, both at INFO level, to the "brc" logger, logs the parsed
    arguments, then loads the vocabulary, builds the dataset and trains the
    model.

    Args:
        args: parsed options. This function reads ``log_path``,
            ``vocab_dir``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir``,
            ``algo`` and ``dropout_keep_prob``.
    """
    log_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    brc_logger = logging.getLogger("brc")

    def _attach(handler):
        # Same level and format for every handler attached below.
        handler.setLevel(logging.INFO)
        handler.setFormatter(log_format)
        brc_logger.addHandler(handler)

    _attach(logging.FileHandler(args.log_path))
    _attach(logging.StreamHandler())

    brc_logger.info(args)

    brc_logger.info('加载数据集和词汇表...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)
    dataset = BRCDataset(args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)

    brc_logger.info('词语转化为id序列...')
    dataset.convert_to_ids(vocabulary)

    brc_logger.info('初始化模型...')
    model = RCModel(vocabulary, args)

    brc_logger.info('训练模型...')
    model.train(dataset, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    brc_logger.info('训练完成!')
Exemple #9
0
def predict(args):
    """
    Predict answers for the test files with a restored model.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``test_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``save_dir``, ``batch_size``, ``epochs``,
            ``warmup_proportion``, ``model_dir`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.test_files`` is empty.
    """
    log = logging.getLogger("QAPointNet")

    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.test_files) > 0, 'No test files are provided.'
    data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                      args.save_dir, test_files=args.test_files)

    # The model constructor takes step counts even for prediction; they are
    # derived from the size of the dataset's train_set.
    total_steps = int(len(data.train_set) / args.batch_size * args.epochs)
    warmup_steps = int(total_steps * args.warmup_proportion)

    log.info('Converting text into ids...')
    data.convert_to_ids(vocabulary)
    log.info('Restoring the model...')

    network = RCModel(vocabulary, total_steps, warmup_steps, args)
    # NOTE(review): the checkpoint prefix is hard-coded rather than taken
    # from args.
    network.restore(args.model_dir, 'BIDAF_18000')

    log.info('Predicting answers for test set...')
    # NOTE(review): batch size is fixed at 64 here, not args.batch_size.
    padding_id = vocabulary.get_word_id(vocabulary.pad_token)
    batches = data.gen_mini_batches('test', 64, padding_id, shuffle=False)

    network.evaluate(batches,
                     result_dir=args.result_dir,
                     result_prefix='test.predicted')
Exemple #10
0
def evaluate(args):
    """
    Evaluate a restored ``S_netModel`` on the dev files.

    Builds the dataset from ``args.dev_files`` with ``use_type="dev"``,
    restores the model from ``args.model_dir`` with prefix ``args.algo``
    and runs ``evaluate`` over the dev batches, using
    ``args.result_prefix`` as the result prefix.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``max_train_sample_num``, ``dev_files``, ``model_dir``,
            ``algo``, ``batch_size``, ``result_dir`` and ``result_prefix``.
    """
    log = logging.getLogger("brc")
    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    # NOTE: a test-file variant of this dataset (args.test_files,
    # use_type="test") and its non-empty check are disabled here.
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.max_train_sample_num, args.dev_files,
                         use_type="dev")

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    model = S_netModel(vocabulary, args)
    log.info('Restoring the model...')
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('evaluate answers for dev set...')
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    dev_batches = dataset.gen_mini_batches('dev', args.batch_size,
                                           pad_id=padding_id, shuffle=False)
    # NOTE: model.predict(...) with the same arguments is the disabled
    # alternative to evaluate here.
    model.evaluate(dev_batches,
                   result_dir=args.result_dir,
                   result_prefix=args.result_prefix)
Exemple #11
0
def train(args):
    """
    Train the reading comprehension model, optionally resuming.

    The vocabulary is unpickled directly from ``args.vocab_path``. When
    ``args.restore`` is truthy the model is first restored from
    ``args.model_dir`` with prefix ``args.algo`` before training starts.

    Args:
        args: parsed options. This function reads ``vocab_path``, ``algo``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``max_a_len``,
            ``train_files``, ``dev_files``, ``restore``, ``model_dir``,
            ``epochs``, ``batch_size`` and ``dropout_keep_prob``.
    """
    log = logging.getLogger("brc")
    log.info('Load data_set and vocab...')
    with open(args.vocab_path, 'rb') as handle:
        vocabulary = pickle.load(handle)

    dataset = BRCDataset(args.algo, args.max_p_num, args.max_p_len,
                         args.max_q_len, args.max_a_len,
                         args.train_files, args.dev_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    log.info('Initialize the model...')
    model = RCModel(vocabulary, args)

    if args.restore:
        log.info('Restoring the model...')
        model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('Training the model...')
    model.train(dataset, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('Done with model training!')
Exemple #12
0
def train(args):
    """
    Train the reading comprehension model.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir``,
            ``algo`` and ``dropout_keep_prob``.
    """
    log = logging.getLogger("brc")

    # Load the vocabulary object that the prepare step pickled to disk.
    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    # Dataset limits: max passage count, passage length, question length.
    # Training only uses the train and dev files.
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)

    # Map the words of the questions and passages to vocabulary ids.
    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    # Build the neural network.
    log.info('Initialize the model...')
    model = RCModel(vocabulary, args)

    log.info('Training the model...')
    # Arguments passed to RCModel.train below:
    #   data: the BRCDataset instance
    #   epochs: number of training epochs
    #   batch_size: mini-batch size
    #   save_dir: the directory to save the model
    #   save_prefix: the prefix indicating the model type
    #   dropout_keep_prob: float dropout keep probability
    model.train(dataset, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('Done with model training!')
Exemple #13
0
def train(args):
    """
    Train the reading comprehension model.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir``,
            ``algo`` and ``dropout_keep_prob``.
    """
    log = logging.getLogger("brc")
    log.info('Load data_set and vocab...')

    # Load the pickled vocab object (token/id mappings and its helpers).
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)
    # NOTE: dumping the raw dev/test/train sets via save_set_file is
    # disabled here.

    log.info('Converting text into ids...')
    # Converts the raw train/dev/test sets to their id form.
    dataset.convert_to_ids(vocabulary)

    log.info('Initialize the model...')
    model = RCModel(vocabulary, args)
    # NOTE: restoring the previously saved model (model.restore with
    # args.model_dir / args.algo) is disabled here.

    log.info('Training the model...')
    model.train(dataset, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('Done with model training!')
Exemple #14
0
def train(args):
    """
    Train the reading comprehension model, then evaluate on dev and test.

    Loads the pickled vocabulary (``vocab.pkl``), builds the dataset from
    the train/dev/test files, and inside a fresh ``tf.Graph``:

    1. trains the model ("Train" name scope),
    2. evaluates on the dev set and logs loss/metrics ("Valid" scope),
    3. runs evaluation over the test set ("Test" scope).

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``prepared_dir``,
            ``train_files``, ``dev_files``, ``test_files``, ``epochs``,
            ``batch_size``, ``result_dir``, ``dropout_keep_prob`` and
            ``summary_dir``.

    Raises:
        AssertionError: if ``args.dev_files`` or ``args.test_files`` is
            empty. Both checks run only after training has finished.
    """
    logger = logging.getLogger("brc")
    logger.info('Loading vocab...')
    # The ``with`` block closes the file on exit, so no explicit close()
    # is needed afterwards.
    with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'rb') as fin:
        vocab = pickle.load(fin)
    # Resolve the pad id now: ``vocab`` is deleted once the model is built.
    pad_id = vocab.get_id(vocab.pad_token)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.prepared_dir, args.train_files, args.dev_files,
                          args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    g = tf.Graph()
    with g.as_default():
        rc_model = RCModel(vocab.embeddings, pad_id, args)
        del vocab
        # Train
        with tf.name_scope("Train"):
            logger.info('Training the model...')
            # NOTE(review): training saves under args.result_dir with the
            # prefix 'test.predicted' -- confirm this is intended.
            rc_model.train(brc_data,
                           args.epochs,
                           args.batch_size,
                           save_dir=args.result_dir,
                           save_prefix='test.predicted',
                           dropout_keep_prob=args.dropout_keep_prob)
        # Open a summary writer for the graph in args.summary_dir and close
        # it right away.
        tf.summary.FileWriter(args.summary_dir, g).close()
        with tf.name_scope('Valid'):
            assert len(args.dev_files) > 0, 'No dev files are provided.'
            logger.info('Evaluating the model on dev set...')
            dev_batches = brc_data.gen_mini_batches('dev',
                                                    args.batch_size,
                                                    pad_id=pad_id,
                                                    shuffle=False)
            dev_loss, dev_bleu_rouge = rc_model.evaluate(
                dev_batches,
                result_dir=args.result_dir,
                result_prefix='dev.predicted')
            logger.info('Loss on dev set: {}'.format(dev_loss))
            logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
            logger.info('Predicted answers are saved to {}'.format(
                os.path.join(args.result_dir)))
        with tf.name_scope('Test'):
            assert len(args.test_files) > 0, 'No test files are provided.'
            logger.info('Predicting answers for test set...')
            test_batches = brc_data.gen_mini_batches('test',
                                                     args.batch_size,
                                                     pad_id=pad_id,
                                                     shuffle=False)
            rc_model.evaluate(test_batches,
                              result_dir=args.result_dir,
                              result_prefix='test.predicted')
Exemple #15
0
def evaluate(logger, args):
    """Evaluate a specific model using the dev set.

    Loads the pickled vocabulary, builds the dev dataset, constructs the
    model inside fresh fluid programs, loads saved variables from
    ``args.load_dir`` and runs ``validation`` on a test-mode clone of the
    main program. The resulting loss and metrics are logged.

    Args:
        logger: logger used for all progress and result messages.
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``devset``,
            ``hidden_size``, ``use_gpu``, ``load_dir`` and ``result_dir``.

    Returns:
        None. If ``args.load_dir`` is falsy an error is logged and the
        function returns before any evaluation is run.
    """
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
        logger.info('vocab size is {} and embed dim is {}'.format(
            vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          dev_files=args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # initialize parameters
            # Pick the device: on CPU the device count comes from the
            # CPU_NUM environment variable, falling back to the machine's
            # CPU count; on GPU, device 0 is used and the count is the
            # number of CUDA devices.
            if not args.use_gpu:
                place = fluid.CPUPlace()
                dev_count = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            else:
                place = fluid.CUDAPlace(0)
                dev_count = fluid.core.get_cuda_device_count()

            exe = Executor(place)
            # A saved model is mandatory for evaluation: without a load
            # directory there is nothing to evaluate, so bail out.
            if args.load_dir:
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(exe,
                                           args.load_dir,
                                           main_program=main_program)
            else:
                logger.error('No model file to load ...')
                return

            # Evaluate on a clone of the main program made with
            # for_test=True.
            inference_program = main_program.clone(for_test=True)
            eval_loss, bleu_rouge = validation(inference_program, avg_cost,
                                               s_probs, e_probs, match,
                                               feed_order, place, dev_count,
                                               vocab, brc_data, logger, args)
            logger.info('Dev eval loss {}'.format(eval_loss))
            logger.info('Dev eval result: {}'.format(bleu_rouge))
            logger.info('Predicted answers are saved to {}'.format(
                os.path.join(args.result_dir)))
Exemple #16
0
def train(args):
    """
    Train the reading comprehension model (no progress logging).

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir`` and
            ``algo``.
    """
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)
    dataset.convert_to_ids(vocabulary)

    model = RCModel(vocabulary, args)
    model.train(dataset, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo)
Exemple #17
0
def train(args):
    """
    Train the DrQA-style reading comprehension model.

    Loads the pickled vocabulary bundle, records the POS and NER vocabulary
    sizes on ``args``, converts the dataset to ids, pickles ``args`` to
    ``args.args_file`` and then trains a ``DrqaModel``.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_file``,
            ``dev_file``, ``test_file`` and ``args_file``, and sets
            ``pos_size`` and ``ner_size`` on it.
    """
    logger = logging.getLogger("rc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        data_vocabs = pickle.load(fin)
    # Set these before args is pickled so the sizes are saved with it.
    args.pos_size = data_vocabs.pos_vocab.size()
    args.ner_size = data_vocabs.ner_vocab.size()
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_file, args.dev_file, args.test_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(data_vocabs)
    logger.info('Saving the args')
    # Use a context manager so the file is flushed and closed even if
    # pickling fails; the original left the handle open.
    with open(args.args_file, 'wb') as fout:
        pickle.dump(args, fout)
    logger.info('Initialize the model...')
    rc_model = DrqaModel(data_vocabs.word_vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data)
def prepare(args):
    """
    Check the data, create directories, build and save the vocabulary.

    Steps, in order: assert every train/dev/test file exists; create the
    vocab/model/result/summary directories that are missing; build a
    lower-cased vocabulary from the training words and drop tokens via
    ``filter_tokens_by_cnt(min_cnt=2)``; convert the dataset to ids and
    save it as records; pickle the vocabulary to ``vocab.data``.

    Args:
        args: parsed options. This function reads ``train_files``,
            ``dev_files``, ``test_files``, ``vocab_dir``, ``model_dir``,
            ``result_dir``, ``summary_dir``, ``max_p_num``, ``max_p_len``
            and ``max_q_len``.

    Raises:
        AssertionError: if any listed data file does not exist.
    """
    log = logging.getLogger("brc")

    log.info('Checking the data files...')
    all_files = args.train_files + args.dev_files + args.test_files
    for path in all_files:
        assert os.path.exists(path), '{} file does not exist.'.format(path)

    log.info('Preparing the directories...')
    needed_dirs = (args.vocab_dir, args.model_dir,
                   args.result_dir, args.summary_dir)
    for directory in needed_dirs:
        if os.path.exists(directory):
            continue
        os.makedirs(directory)

    log.info('Building vocabulary...')
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files, args.test_files)
    vocabulary = Vocab(lower=True)
    for token in dataset.word_iter('train'):
        vocabulary.add(token)

    size_before = vocabulary.size()
    vocabulary.filter_tokens_by_cnt(min_cnt=2)
    removed = size_before - vocabulary.size()
    log.info('After filter {} tokens, the final vocab size is {}'.format(
        removed, vocabulary.size()))

    # NOTE: random embedding initialisation
    # (randomly_init_embeddings(args.embed_size)) is disabled in this step.

    # Save the datasets to record files.
    log.info('Saving the datasets.')
    dataset.convert_to_ids(vocabulary)
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    dataset.save_records(padding_id)

    log.info('Saving vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'wb') as handle:
        pickle.dump(vocabulary, handle)

    log.info('Done with preparing!')
Exemple #19
0
def train(args):
    """
    Train the reading comprehension model.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir``,
            ``algo`` and ``dropout_keep_prob``.
    """
    log = logging.getLogger("brc")

    log.info('加载数据集和词汇表...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)

    log.info('词语转化为id序列...')
    dataset.convert_to_ids(vocabulary)

    log.info('初始化模型...')
    model = RCModel(vocabulary, args)

    log.info('训练模型...')
    model.train(dataset,
                args.epochs,
                args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('训练完成!')
Exemple #20
0
def train(args):
    """
    Train the reading comprehension model.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``max_p_num``, ``max_p_len``, ``max_q_len``, ``train_files``,
            ``dev_files``, ``epochs``, ``batch_size``, ``model_dir``,
            ``algo`` and ``dropout_keep_prob``.
    """
    log = logging.getLogger("brc")

    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    log.info('Initialize the model...')
    model = RCModel(vocabulary, args)

    log.info('Training the model...')
    model.train(dataset,
                args.epochs,
                args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('Done with model training!')
Exemple #21
0
def evaluate(args):
    """
    Evaluate the trained model on the dev files (no progress logging).

    The model is restored from ``args.model_dir`` using the prefix
    ``args.algo + '_7'``.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``dev_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo`` and ``batch_size``.

    Raises:
        AssertionError: if ``args.dev_files`` is empty.
    """
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.dev_files) > 0, 'No dev files are provided.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         dev_files=args.dev_files)
    dataset.convert_to_ids(vocabulary)

    model = RCModel(vocabulary, args)
    # NOTE(review): the '_7' suffix on the checkpoint prefix is hard-coded.
    model.restore(model_dir=args.model_dir, model_prefix=args.algo + '_7')

    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('dev', args.batch_size,
                                       pad_id=padding_id, shuffle=False)
    # NOTE(review): the value returned by evaluate is neither logged nor
    # returned to the caller.
    model.evaluate(batches)
Exemple #22
0
def predict(args):
    """
    Predict answers for the test files.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``test_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo``, ``batch_size`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.test_files`` is empty.
    """
    log = logging.getLogger("brc")

    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.test_files) > 0, 'No test files are provided.'
    dataset = BRCDataset(args.max_p_num,
                         args.max_p_len,
                         args.max_q_len,
                         test_files=args.test_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    log.info('Restoring the model...')
    model = RCModel(vocabulary, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('Predicting answers for test set...')
    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('test',
                                       args.batch_size,
                                       pad_id=padding_id,
                                       shuffle=False)
    model.evaluate(batches,
                   result_dir=args.result_dir,
                   result_prefix='test.predicted')
Exemple #23
0
def train(args):
    """
    Train the reading comprehension model.

    Optionally loads pretrained word2vec embeddings into the vocabulary
    (when ``args.word2vec_path`` is set) and a character vocabulary (when
    ``args.use_char_embed`` is set). The model class is chosen by
    ``choose_model_by_gpu_setting(args)``.

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``word2vec_path``, ``learn_word_embedding``,
            ``use_char_embed``, ``max_p_num``, ``max_p_len``,
            ``max_q_len``, ``train_files``, ``dev_files``, ``batch_size``,
            ``decay_epochs``, ``epochs``, ``model_dir``, ``algo`` and
            ``dropout_keep_prob``. It sets ``decay_steps`` and, when char
            embeddings are enabled, ``char_vocab`` on it.
    """
    log = logging.getLogger("brc")
    log.info('Load data_set and vocab...')
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    if args.word2vec_path:
        log.info('learn_word_embedding:{}'.format(args.learn_word_embedding))
        log.info('loadding %s \n' % args.word2vec_path)
        w2v_model = gensim.models.Word2Vec.load(args.word2vec_path)
        vocabulary.load_pretrained_embeddings_from_w2v(w2v_model.wv)
        log.info('load pretrained embedding from %s done\n' % args.word2vec_path)

    if args.use_char_embed:
        char_vocab_file = os.path.join(args.vocab_dir, 'char_vocab.data')
        with open(char_vocab_file, 'rb') as handle:
            char_vocabulary = pickle.load(handle)

    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.train_files, args.dev_files)

    # Express the decay interval in steps: decay_epochs times the number of
    # whole batches per epoch.
    batches_per_epoch = dataset.size('train') // args.batch_size
    args.decay_steps = args.decay_epochs * batches_per_epoch

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocabulary)

    if args.use_char_embed:
        log.info('Converting text into char ids...')
        dataset.convert_to_char_ids(char_vocabulary)
        log.info('Binding char_vocab to args to pass to RCModel')
        args.char_vocab = char_vocabulary

    model_cls = choose_model_by_gpu_setting(args)
    log.info('Initialize the model...')
    model = model_cls(vocabulary, args)

    log.info('Training the model...{}'.format(model_cls.__name__))
    model.train(dataset,
                args.epochs,
                args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout_keep_prob=args.dropout_keep_prob)
    log.info('Done with model training!')
Exemple #24
0
def predict(args):
    """
    Predict answers for the test files (no progress logging).

    Args:
        args: parsed options. This function reads ``vocab_dir``,
            ``test_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
            ``model_dir``, ``algo``, ``batch_size`` and ``result_dir``.

    Raises:
        AssertionError: if ``args.test_files`` is empty.
    """
    vocab_file = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_file, 'rb') as handle:
        vocabulary = pickle.load(handle)

    assert len(args.test_files) > 0, 'No test files are provided.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         test_files=args.test_files)
    dataset.convert_to_ids(vocabulary)

    model = RCModel(vocabulary, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    padding_id = vocabulary.get_id(vocabulary.pad_token)
    batches = dataset.gen_mini_batches('test', args.batch_size,
                                       pad_id=padding_id, shuffle=False)
    model.evaluate(batches,
                   result_dir=args.result_dir,
                   result_prefix='test.predicted')
Exemple #25
0
def evaluate(args):
    """
    Evaluate a restored model on the dev files and log the outcome.

    Loads the pickled vocabulary, builds a dev-only ``BRCDataset``, restores
    ``RCModel`` from ``args.model_dir`` and logs the loss and the result
    object returned by ``RCModel.evaluate`` for the un-shuffled dev batches.
    """
    log = logging.getLogger("brc")

    log.info('Load data_set and vocab...')
    vocab_path = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         dev_files=args.dev_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocab)

    log.info('Restoring the model...')
    model = RCModel(vocab, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)

    log.info('Evaluating the model on dev set...')
    pad_id = vocab.get_id(vocab.pad_token)
    batches = dataset.gen_mini_batches('dev', args.batch_size,
                                       pad_id=pad_id, shuffle=False)
    dev_loss, dev_bleu_rouge = model.evaluate(batches,
                                              result_dir=args.result_dir,
                                              result_prefix='dev.predicted')

    log.info('Loss on dev set: {}'.format(dev_loss))
    log.info('Result on dev set: {}'.format(dev_bleu_rouge))
    saved_to = os.path.join(args.result_dir)
    log.info('Predicted answers are saved to {}'.format(saved_to))
Exemple #26
0
def predict(args):
    """
    predicts answers for test files

    Loads the pickled vocabulary bundle from ``args.vocab_dir``, converts the
    single test file to ids, then replaces ``args`` with the argument object
    pickled at ``args.args_file`` (updated with the POS/NER vocabulary sizes)
    and runs ``DrqaModel.evaluate`` on the dataset.

    Raises:
        AssertionError: if ``args.test_file`` is empty.
    """
    logger = logging.getLogger("rc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        data_vocabs = pickle.load(fin)
    assert args.test_file, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          test_file=args.test_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(data_vocabs)
    logger.info('Restoring the model...')

    # From here on `args` is the pickled argument object, not the one passed
    # in. The file is opened in a `with` block so the handle is closed
    # deterministically instead of being leaked.
    # NOTE(review): unpickling can execute arbitrary code; args_file must
    # come from a trusted source.
    with open(args.args_file, 'rb') as args_fin:
        args = pickle.load(args_fin)
    args.pos_size = data_vocabs.pos_vocab.size()
    args.ner_size = data_vocabs.ner_vocab.size()

    rc_model = DrqaModel(data_vocabs.word_vocab, args, eva=True)
    rc_model.evaluate(brc_data)
Exemple #27
0
def predict(args):
    """
    Predict answers for the test files with a restored, finalized model.

    Loads the pickled vocabulary, builds a test-only ``BRCDataset`` (which
    here also takes ``args.max_word_len``), restores ``RCModel``, calls its
    ``finalize`` and then evaluates the un-shuffled test batches, passing
    ``args.result_dir`` and the prefix ``'test.predicted'`` for the output.
    """
    log = logging.getLogger("brc")

    log.info('Load data_set and vocab...')
    vocab_path = os.path.join(args.vocab_dir, 'vocab.data')
    with open(vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)
    assert len(args.test_files) > 0, 'No test files are provided.'
    dataset = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                         args.max_word_len, test_files=args.test_files)

    log.info('Converting text into ids...')
    dataset.convert_to_ids(vocab)

    log.info('Restoring the model...')
    model = RCModel(vocab, args)
    model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    # Original author's note (translated):
    # once every operation has been added, sess.graph.finalize() is used to
    # make the whole graph read-only.
    # Note: tf.train.Saver() also adds nodes to the graph, so it has to be
    # created before finalize as well.
    # However, tf.train.Saver() only stores the variables that already exist
    # at the moment the Saver is declared!
    model.finalize()

    log.info('Predicting answers for test set...')
    pad_id = vocab.get_id(vocab.pad_token)
    batches = dataset.gen_mini_batches('test', args.batch_size,
                                       pad_id=pad_id, shuffle=False)
    model.evaluate(batches, result_dir=args.result_dir,
                   result_prefix='test.predicted')
Exemple #28
0
def _merge_yesno_into_qa(qa_path, yesno_path, out_path):
    """
    Copy the QA result file to ``out_path``, substituting YESNO records.

    Both inputs are read line by line as JSON records carrying a
    'question_id' key. Every line of ``qa_path`` is written to ``out_path``
    unchanged, except that a line whose question_id also occurs in
    ``yesno_path`` is replaced by the corresponding YESNO line.

    Returns:
        The number of lines read from ``qa_path``.
    """
    yesno_records = {}
    with open(yesno_path, 'r') as f_in:
        for line in f_in:
            # A final line without a trailing newline would otherwise be
            # glued to the next record when substituted mid-file.
            if not line.endswith('\n'):
                line += '\n'
            yesno_records[json.loads(line)['question_id']] = line

    total_rst_num = 0
    with open(qa_path, 'r') as f_in, open(out_path, 'w') as f_out:
        for total_rst_num, line in enumerate(f_in, 1):
            question_id = json.loads(line)['question_id']
            f_out.write(yesno_records.get(question_id, line))
    return total_rst_num


def predict(args):
    """
    predicts answers for test files

    Loads the pickled vocabulary from ``args.vocab_path``, builds a test-only
    ``BRCDataset``, restores ``RCModel`` and evaluates the un-shuffled test
    batches. When ``args.algo == 'YESNO'`` the result prefix is the stem of
    the first test file, and afterwards the YESNO predictions are merged into
    that file's records and written to
    ``<result_dir>/<prefix>.134.class.<run_id>.json``.

    Raises:
        AssertionError: if ``args.test_files`` is empty.
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.algo,
                          args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          args.max_a_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)

    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for test set...')
    test_batches = brc_data.gen_mini_batches('test',
                                             args.batch_size,
                                             pad_id=vocab.get_id(
                                                 vocab.pad_token),
                                             shuffle=False)

    is_yesno = args.algo == 'YESNO'
    if is_yesno:
        # Only one test file is expected in YESNO mode; its stem names the
        # result files.
        qa_result_path = args.test_files[0]
        result_prefix = os.path.splitext(os.path.basename(qa_result_path))[0]
    else:
        result_prefix = 'test.predicted.qa'

    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir,
                      result_prefix=result_prefix)

    if is_yesno:
        # Merge the YESNO predictions into the QA results.
        # NOTE(review): assumes rc_model.evaluate wrote
        # '<result_prefix>.YESNO.json' into result_dir - confirm.
        yesno_result_path = os.path.join(args.result_dir,
                                         result_prefix + '.YESNO.json')
        out_file_path = os.path.join(
            args.result_dir,
            result_prefix + '.134.class.' + str(args.run_id) + '.json')
        total_rst_num = _merge_yesno_into_qa(qa_result_path,
                                             yesno_result_path,
                                             out_file_path)

        print('total rst num : ', total_rst_num)
        print('yes no label combining done!')
Exemple #29
0
def train(args):
    """
    Build the vocabulary from the training data, then train and evaluate.

    Steps, in order: check that every train/dev/test file exists, create the
    vocab/model/result directories, build the vocabulary from the training
    words and drop tokens seen fewer than 2 times, load pre-trained word
    embeddings, convert the dataset to ids, pickle the vocabulary to
    ``<vocab_dir>/vocab.data``, train ``S_netModel`` and finally call its
    ``evaluate`` on the 'test' batches.

    Raises:
        AssertionError: if any of the listed data files does not exist.
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_train_sample_num, args.train_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.load_pretrained_embeddings(args.word_embedding_path)

    # Each progress message is emitted right before the step it names; the
    # statement order of the actual work is unchanged.
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Initialize the model...')
    rc_model = S_netModel(vocab, args)

    logger.info('Training the model...')
    rc_model.train(brc_data,
                   args.epochs,
                   args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')

    logger.info('evaluate the trained model!')
    # NOTE(review): brc_data above is built from args.train_files only;
    # confirm that a 'test' split is actually available to gen_mini_batches.
    test_batches = brc_data.gen_mini_batches('test',
                                             args.batch_size,
                                             pad_id=vocab.get_id(
                                                 vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir,
                      result_prefix='test.predicted')
    logger.info('Done with model evaluating !')
Exemple #30
0
def train(logger, args):
    """
    Train a model with PaddlePaddle Fluid.

    Loads the pickled vocabulary and the train/dev data, builds the network
    via ``rc_model.rc_model`` inside a fresh main/startup program pair, adds
    the optimizer selected by ``args.optim`` (plus optional L2 weight decay),
    then runs ``args.pass_num`` passes with a ``ParallelExecutor``. Dev-set
    validation runs every ``args.dev_interval`` batches and after every pass
    when a dev set is loaded; persistables are saved every
    ``args.save_interval`` passes under ``args.save_dir/<pass_id>``.

    Args:
        logger: logger used for all progress messages.
        args: parsed options; the attributes read here include vocab_dir,
            max_p_num, max_p_len, max_q_len, trainset, devset, use_gpu,
            enable_ce, random_seed, hidden_size, optim, learning_rate,
            weight_decay, load_dir, pass_num, batch_size, log_interval,
            dev_interval, save_interval and save_dir.
    """
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        # On Python 3 the pickle is read with encoding='bytes'
        # (presumably because it was written by Python 2 - TODO confirm).
        if six.PY2:
            vocab = pickle.load(fin)
        else:
            vocab = pickle.load(fin, encoding='bytes')
        logger.info('vocab size is {} and embed dim is {}'.format(
            vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    # Pick the device and the number of parallel devices: CPU_NUM (or the
    # machine's CPU count) on CPU, the CUDA device count on GPU.
    if not args.use_gpu:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    # With enable_ce both programs get a fixed random seed; further down the
    # same flag disables shuffling, stops each pass after 100 batches and
    # prints "kpis" lines.
    if args.enable_ce:
        main_program.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # clone from default main program and use it as the validation program
            # (the clone is taken before the optimizer is added below)
            inference_program = main_program.clone(for_test=True)

            # build optimizer
            if args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                # NOTE(review): the option is spelled 'rprop' but selects
                # RMSPropOptimizer.
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            # obj_func is what gets minimized and fetched during training:
            # avg_cost, plus the weighted l2_loss term when weight_decay > 0.
            if args.weight_decay > 0.0:
                obj_func = avg_cost + args.weight_decay * l2_loss(main_program)
                optimizer.minimize(obj_func)
            else:
                obj_func = avg_cost
                optimizer.minimize(obj_func)

            # initialize parameters
            # (place is recomputed here with the same GPU/CPU choice as above)
            place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
            exe = Executor(place)
            if args.load_dir:
                # Resume: load previously saved persistables; the startup
                # program is not run in this branch.
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(exe,
                                           args.load_dir,
                                           main_program=main_program)
            else:
                # Fresh start: run the startup program, then overwrite the
                # 'embedding_para' variable with the vocabulary's embeddings
                # as float32.
                exe.run(startup_prog)
                embedding_para = fluid.global_scope().find_var(
                    'embedding_para').get_tensor()
                embedding_para.set(vocab.embeddings.astype(np.float32), place)

            # prepare data
            feed_list = [
                main_program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder = fluid.DataFeeder(feed_list, place)

            logger.info('Training the model...')
            parallel_executor = fluid.ParallelExecutor(
                main_program=main_program,
                use_cuda=bool(args.use_gpu),
                loss_name=avg_cost.name)
            print_para(main_program, parallel_executor, logger, args)

            for pass_id in range(1, args.pass_num + 1):
                pass_start_time = time.time()
                pad_id = vocab.get_id(vocab.pad_token)
                # Shuffle the training batches unless enable_ce is set.
                if args.enable_ce:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=False)
                else:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=True)
                train_reader = read_multiple(train_reader, dev_count)
                log_every_n_batch, n_batch_loss = args.log_interval, 0
                total_num, total_loss = 0, 0
                for batch_id, batch_list in enumerate(train_reader(), 1):
                    feed_data = batch_reader(batch_list, args)
                    fetch_outs = parallel_executor.run(
                        feed=list(feeder.feed_parallel(feed_data, dev_count)),
                        fetch_list=[obj_func.name],
                        return_numpy=False)
                    # Mean of the fetched objective for this step.
                    cost_train = np.array(fetch_outs[0]).mean()
                    # Each step is counted as batch_size * dev_count samples,
                    # independent of how many samples the batches really held.
                    total_num += args.batch_size * dev_count
                    n_batch_loss += cost_train
                    total_loss += cost_train * args.batch_size * dev_count

                    if args.enable_ce and batch_id >= 100:
                        break
                    if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
                        print_para(main_program, parallel_executor, logger,
                                   args)
                        logger.info(
                            'Average loss from batch {} to {} is {}'.format(
                                batch_id - log_every_n_batch + 1, batch_id,
                                "%.10f" % (n_batch_loss / log_every_n_batch)))
                        n_batch_loss = 0
                    # Periodic validation inside the pass.
                    if args.dev_interval > 0 and batch_id % args.dev_interval == 0:
                        if brc_data.dev_set is not None:
                            eval_loss, bleu_rouge = validation(
                                inference_program, avg_cost, s_probs, e_probs,
                                match, feed_order, place, dev_count, vocab,
                                brc_data, logger, args)
                            logger.info('Dev eval loss {}'.format(eval_loss))
                            logger.info(
                                'Dev eval result: {}'.format(bleu_rouge))
                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format(
                    pass_id, time_consumed))
                logger.info(
                    'Evaluating the model after epoch {}'.format(pass_id))
                # End-of-pass validation; eval_loss from here is also what
                # the "kpis" test_cost line below prints.
                if brc_data.dev_set is not None:
                    eval_loss, bleu_rouge = validation(inference_program,
                                                       avg_cost, s_probs,
                                                       e_probs, match,
                                                       feed_order, place,
                                                       dev_count, vocab,
                                                       brc_data, logger, args)
                    logger.info('Dev eval loss {}'.format(eval_loss))
                    logger.info('Dev eval result: {}'.format(bleu_rouge))
                else:
                    logger.warning(
                        'No dev set is loaded for evaluation in the dataset!')

                logger.info('Average train loss for epoch {} is {}'.format(
                    pass_id, "%.10f" % (1.0 * total_loss / total_num)))

                # Save persistables into a per-pass sub-directory.
                if pass_id % args.save_interval == 0:
                    model_path = os.path.join(args.save_dir, str(pass_id))
                    if not os.path.isdir(model_path):
                        os.makedirs(model_path)

                    fluid.io.save_persistables(executor=exe,
                                               dirname=model_path,
                                               main_program=main_program)
                if args.enable_ce:  # For CE
                    print("kpis\ttrain_cost_card%d\t%f" %
                          (dev_count, total_loss / total_num))
                    if brc_data.dev_set is not None:
                        print("kpis\ttest_cost_card%d\t%f" %
                              (dev_count, eval_loss))
                    print("kpis\ttrain_duration_card%d\t%f" %
                          (dev_count, time_consumed))
Exemple #31
0
def train(args):
    """
    trains the reading comprehension model

    Loads the pickled vocabulary and the train/dev data, builds ``RCModel``,
    makes a best-effort attempt to restore a checkpoint from
    ``args.model_dir + config['model_name']`` and then trains for
    ``args.epochs`` epochs. After each epoch the model is evaluated on the
    dev set (when one is loaded) and saved whenever its 'Bleu-4' score
    improves on the best seen so far.
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')

    # Reload a previously saved model (if any) before training.
    file_pre = args.model_dir
    with tf.Session() as sess:
        saver = tf.train.Saver()
        # NOTE(review): `config` is not defined in this function; it is
        # assumed to be a module-level mapping with a 'model_name' key.
        checkpoint_path = file_pre + config['model_name']
        # NOTE(review): the checkpoint is restored into `sess`, while the
        # training steps below run in `rc_model.sess`; confirm the restored
        # values actually reach the session used for training.
        try:
            saver.restore(sess, checkpoint_path)
        except Exception:
            # Restoring stays best-effort, but the failure is no longer
            # swallowed silently (and Ctrl-C is no longer caught).
            logger.warning(
                'Could not restore a checkpoint from {}'.format(
                    checkpoint_path))

        def train_epoch(train_batches, dropout_keep_prob):
            """Run one pass over train_batches; return the mean sample loss."""
            total_num, total_loss = 0, 0
            log_every_n_batch, n_batch_loss = 50, 0
            for bitx, batch in enumerate(train_batches, 1):
                feed_dict = {
                    rc_model.p: batch['passage_token_ids'],
                    rc_model.q: batch['question_token_ids'],
                    rc_model.p_length: batch['passage_length'],
                    rc_model.q_length: batch['question_length'],
                    rc_model.start_label: batch['start_id'],
                    rc_model.end_label: batch['end_id'],
                    rc_model.dropout_keep_prob: dropout_keep_prob
                }
                _, loss = rc_model.sess.run([rc_model.train_op, rc_model.loss],
                                            feed_dict)
                # Weight the batch loss by the number of samples in the batch.
                total_loss += loss * len(batch['raw_data'])
                total_num += len(batch['raw_data'])
                n_batch_loss += loss
                if log_every_n_batch > 0 and bitx % log_every_n_batch == 0:
                    rc_model.logger.info(
                        'Average loss from batch {} to {} is {}'.format(
                            bitx - log_every_n_batch + 1, bitx,
                            n_batch_loss / log_every_n_batch))
                    n_batch_loss = 0
            return 1.0 * total_loss / total_num

        def run_training(data,
                         epochs,
                         batch_size,
                         file_pre,
                         config,
                         dropout_keep_prob=1.0,
                         evaluate=True):
            """Train for `epochs` epochs, saving on dev Bleu-4 improvement."""
            # The original helper declared a `self` parameter that was never
            # passed, so the call below raised TypeError; the vocabulary is
            # taken from the enclosing scope instead.
            pad_id = vocab.get_id(vocab.pad_token)
            max_bleu_4 = 0
            # range(1, epochs) stopped one epoch short of the requested count.
            for epoch in range(1, epochs + 1):
                rc_model.logger.info(
                    'Training the model for epoch {}'.format(epoch))
                train_batches = data.gen_mini_batches('train',
                                                      batch_size,
                                                      pad_id,
                                                      shuffle=True)
                train_loss = train_epoch(train_batches, dropout_keep_prob)
                rc_model.logger.info(
                    'Average train loss for epoch {} is {}'.format(
                        epoch, train_loss))

                if evaluate:
                    logger.info(
                        'Evaluating the model after epoch {}'.format(epoch))
                    if data.dev_set is not None:
                        eval_batches = data.gen_mini_batches('dev',
                                                             batch_size,
                                                             pad_id,
                                                             shuffle=False)
                        eval_loss, bleu_rouge = rc_model.evaluate(eval_batches)
                        logger.info('Dev eval loss {}'.format(eval_loss))
                        rc_model.logger.info(
                            'Dev eval result: {}'.format(bleu_rouge))

                        # Keep only the best model by dev Bleu-4.
                        if bleu_rouge['Bleu-4'] > max_bleu_4:
                            rc_model.save(file_pre + config['model_name'])
                            max_bleu_4 = bleu_rouge['Bleu-4']
                    else:
                        rc_model.logger.warning(
                            'No dev set is loaded for evaluation in the dataset!'
                        )
                else:
                    # No per-epoch checkpoints: the same path is overwritten
                    # after every epoch.
                    rc_model.save(file_pre + config['model_name'])

        run_training(brc_data,
                     args.epochs,
                     args.batch_size,
                     file_pre,
                     config,
                     dropout_keep_prob=args.dropout_keep_prob)
        logger.info('Done with model training!')