Example #1
def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    # load the dataset and the vocabulary (saved by prepare)
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)  # pickle is a standard Python module; this reloads the vocab object saved during prepare
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files) # max passage count, passage length, question length;
                                                            # training loads only the train and dev files
    # use the vocab to convert brc_data into ids
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab) # map the words of questions and passages to the ids stored in the vocabulary
    # initialize the neural network
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    """
    Train the model with data
    Args:
        data: the BRCDataset class implemented in dataset.py
        epochs: number of training epochs
        batch_size:
        save_dir: the directory to save the model
        save_prefix: the prefix indicating the model type
        dropout_keep_prob: float value indicating dropout keep probability
        evaluate: whether to evaluate the model on test set after each epoch
    """
    rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
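Note: every example on this page reads its configuration from an `args` namespace (a couple also take a logger). Below is a minimal, hypothetical sketch of the argparse wiring a run.py entry point might use to drive such prepare/train/evaluate/predict functions; the flag names mirror the args.* attributes accessed in the examples, and the defaults are illustrative only, not taken from any one project.

import argparse


def parse_args():
    # Hypothetical parser; flag names mirror the args.* attributes used in the examples on this page.
    parser = argparse.ArgumentParser('Reading comprehension on the BRC dataset')
    parser.add_argument('--prepare', action='store_true', help='build the vocab and prepare directories')
    parser.add_argument('--train', action='store_true', help='train the model')
    parser.add_argument('--evaluate', action='store_true', help='evaluate the model on the dev files')
    parser.add_argument('--predict', action='store_true', help='predict answers for the test files')
    parser.add_argument('--algo', default='BIDAF', help='model type, also used as the save prefix')
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--dropout_keep_prob', type=float, default=1.0)
    parser.add_argument('--embed_size', type=int, default=300)
    parser.add_argument('--max_p_num', type=int, default=5)
    parser.add_argument('--max_p_len', type=int, default=500)
    parser.add_argument('--max_q_len', type=int, default=60)
    parser.add_argument('--train_files', nargs='+', default=[])
    parser.add_argument('--dev_files', nargs='+', default=[])
    parser.add_argument('--test_files', nargs='+', default=[])
    parser.add_argument('--vocab_dir', default='../data/vocab/')
    parser.add_argument('--model_dir', default='../data/models/')
    parser.add_argument('--result_dir', default='../data/results/')
    parser.add_argument('--summary_dir', default='../data/summary/')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    if args.prepare:
        prepare(args)
    if args.train:
        train(args)
    if args.evaluate:
        evaluate(args)
    if args.predict:
        predict(args)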
Example #2
def evaluate(args):
    """
       predicts answers for test files
       """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

    # assert len(args.test_files) > 0, 'No test files are provided.'

    # brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.max_train_sample_num,
    #                       args.test_files, use_type="test")
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          args.max_train_sample_num,
                          args.dev_files,
                          use_type="dev")

    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)

    rc_model = S_netModel(vocab, args)
    logger.info('Restoring the model...')
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('evaluate answers for dev set...')
    test_batches = brc_data.gen_mini_batches('dev',
                                             args.batch_size,
                                             pad_id=vocab.get_id(
                                                 vocab.pad_token),
                                             shuffle=False)
    #rc_model.predict(test_batches,result_dir=args.result_dir, result_prefix=args.result_prefix)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir,
                      result_prefix=args.result_prefix)
Example #3
def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

    # logger.info('Assigning embeddings...')
    # vocab.embed_dim = args.embed_size
    # vocab.load_pretrained_embeddings(args.embedding_path)

    logger.info('Vocabulary size: %s' % vocab.size())

    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          vocab, args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    # rc_model = MTRCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data,
                   args.epochs,
                   args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
Example #4
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
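Note: the prepare() examples assume a Vocab class exposing add, size, filter_tokens_by_cnt, randomly_init_embeddings and get_id. The simplified stand-in below only sketches that interface so the calls above are easier to follow; it is not the projects' actual vocab.py, which also handles pretrained embeddings and more.

import numpy as np


class MiniVocab(object):
    """Simplified, illustrative stand-in for the Vocab class used in the prepare() examples."""

    def __init__(self, lower=True):
        self.lower = lower
        self.pad_token, self.unk_token = '<blank>', '<unk>'
        self.token2id, self.id2token, self.token_cnt = {}, {}, {}
        self.embeddings, self.embed_dim = None, None
        for token in [self.pad_token, self.unk_token]:
            self.add(token, cnt=0)

    def size(self):
        return len(self.id2token)

    def add(self, token, cnt=1):
        token = token.lower() if self.lower else token
        if token not in self.token2id:
            idx = len(self.id2token)
            self.token2id[token], self.id2token[idx] = idx, token
        self.token_cnt[token] = self.token_cnt.get(token, 0) + cnt
        return self.token2id[token]

    def get_id(self, token):
        token = token.lower() if self.lower else token
        return self.token2id.get(token, self.token2id[self.unk_token])

    def filter_tokens_by_cnt(self, min_cnt):
        # keep tokens seen at least min_cnt times, then rebuild the id mappings
        kept = [t for t in self.token2id if self.token_cnt.get(t, 0) >= min_cnt]
        self.token2id, self.id2token = {}, {}
        for token in [self.pad_token, self.unk_token] + kept:
            if token not in self.token2id:
                idx = len(self.id2token)
                self.token2id[token], self.id2token[idx] = idx, token

    def randomly_init_embeddings(self, embed_dim):
        # one random vector per token; the pad token is kept at all zeros
        self.embed_dim = embed_dim
        self.embeddings = np.random.rand(self.size(), embed_dim)
        self.embeddings[self.get_id(self.pad_token)] = np.zeros(embed_dim)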
Example #5
File: run.py Project: Yaozeng/MRC
def train(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== training ======")

    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.save_dir,
                            args.train_files, args.dev_files)
    num_train_steps = int(
        len(dataloader.train_set) / args.batch_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    logger.info('Converting text into ids...')
    dataloader.convert_to_ids(vocab)

    logger.info('Initialize the model...')
    model = RCModel(vocab, num_train_steps, num_warmup_steps, args)
    del vocab

    logger.info('Training the model...')
    model.train(dataloader, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo,
                dropout=args.dropout)

    logger.info('====== Done with model training! ======')
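Note: a quick illustration of the warmup arithmetic used above; the numbers are made up, not taken from this project.

# Illustrative numbers only: 100,000 training samples, batch_size 32, 2 epochs, warmup_proportion 0.1.
num_train_steps = int(100000 / 32 * 2)         # 6250 optimizer steps in total
num_warmup_steps = int(num_train_steps * 0.1)  # 625 of them used for learning-rate warmup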
Example #6
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    for dir_path in [args.vocab_dir, args.model_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    # unfiltered_vocab_size = vocab.size()
    print("vocab size is ", vocab.size())
    vocab.filter_tokens_by_cnt(min_cnt=2)
    print("after filtered vocab size is ", vocab.size())
    # filtered_num = unfiltered_vocab_size - vocab.size()

    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_pre_train:
        vocab.load_pretrained_embeddings(args.pre_train_file)

    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
Example #7
def prepare(args):
    logger = logging.getLogger("rc")
    logger.info('train test split...')
    train_test_split(args.all_file, args.train_file, args.test_file,
                     args.train_rate)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_file, args.dev_file, args.test_file)
    data_vocabs = DataVocabs()
    for word, pos, ner in brc_data.word_iter('train'):
        data_vocabs.word_vocab.add(word)
        data_vocabs.pos_vocab.add(pos)
        data_vocabs.ner_vocab.add(ner)
    unfiltered_vocab_size = data_vocabs.word_vocab.size()
    data_vocabs.word_vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - data_vocabs.word_vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, data_vocabs.word_vocab.size()))
    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)
    data_vocabs.word_vocab.load_pretrained_embeddings(args.embedding_path)
    logger.info('embedding size: {}, {}'.format(
        len(data_vocabs.word_vocab.embeddings),
        len(data_vocabs.word_vocab.embeddings[0])))
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(data_vocabs, fout)
    logger.info('Done with preparing!')
Example #8
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("Cail")
    logger.info('Checking the data files...')
    print("Checking the data files...")
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files) # construct the dataset object

    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)
    logger.info("Tokens num {}".format(vocab.size()))
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=5) # filter out low-frequency tokens
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size) # step 1: randomly initialized embeddings
    # vocab.load_pretrained_embeddings(args.embedding_path)  # step 2: glove pre-trained embeddings restricted to the vocab
    # logger.info("Vocab size is {} from embedding".format(vocab.size()))
    logger.info('Saving vocab...')
    with open(args.vocab_path, 'wb') as fout:  # save the vocab
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #9
def predict(args):
    """
    predicts answers for the test files
    """
    logger = logging.getLogger("brc")
    logger.info('Loading the dataset and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for the test set...')
    test_batches = brc_data.gen_mini_batches('test',
                                             args.batch_size,
                                             pad_id=vocab.get_id(
                                                 vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir,
                      result_prefix='test.predicted')
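Note: gen_mini_batches hands the model fixed-shape batches, and pad_id (here vocab.get_id(vocab.pad_token)) is presumably what shorter id sequences are padded with. The hypothetical helper below only illustrates that padding idea; it is not the projects' dataset.py, which also arranges passages per document and tracks true lengths.

def pad_batch(token_id_lists, pad_id, max_len):
    """Pad (or truncate) every id sequence in a batch to max_len using pad_id."""
    padded, lengths = [], []
    for ids in token_id_lists:
        ids = ids[:max_len]
        lengths.append(len(ids))
        padded.append(ids + [pad_id] * (max_len - len(ids)))
    return padded, lengths


# pad_batch([[4, 8, 15], [16, 23]], pad_id=0, max_len=4)
# -> ([[4, 8, 15, 0], [16, 23, 0, 0]], [3, 2])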
Example #10
def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # load the vocab object, which contains token2id, id2token and related helpers
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    # brc_data.save_set_file(brc_data.dev_set, './save_sets', 'dev_set')
    # brc_data.save_set_file(brc_data.test_set, './save_sets', 'test_set')
    # brc_data.save_set_file(brc_data.train_set, './save_sets', 'train_set')
    logger.info('Converting text into ids...')
    # convert the raw data in [self.train_set, self.dev_set, self.test_set] into id form
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    # optionally restore the model saved by a previous run
    # rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    # ****************************************************************
    logger.info('Training the model...')
    rc_model.train(brc_data,
                   args.epochs,
                   args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
Example #11
def train(args):
    """
    trains the reading comprehension model
    """
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")

    file_handler = logging.FileHandler(args.log_path)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    logger.info(args)

    logger.info('Loading the dataset and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initializing the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
Example #12
def evaluate(args):
    """
    evaluates the trained model on the dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Loading the dataset and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          dev_files=args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating the model on the dev set...')
    dev_batches = brc_data.gen_mini_batches('dev',
                                            args.batch_size,
                                            pad_id=vocab.get_id(
                                                vocab.pad_token),
                                            shuffle=False)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(dev_batches,
                                                 result_dir=args.result_dir,
                                                 result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))
Example #13
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    logger.info('Preparing the directories...')

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)#TODO-load_pretrained_embeddings
    vocab.load_pretrained_embeddings(args.embedding_path)  #glove pre-trained

    logger.info('Saving vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
    with open(args.vocab_path, 'wb') as fout:  # no distinction between search and zhidao
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #14
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #15
def evaluate(args):
    """
    evaluate the trained model on dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          dev_files=args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev',
                                            args.batch_size,
                                            pad_id=vocab.get_id(
                                                vocab.pad_token),
                                            shuffle=False)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(dev_batches,
                                                 result_dir=args.result_dir,
                                                 result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(
        os.path.join(args.result_dir)))
Example #16
File: run.py Project: Yaozeng/MRC
def prepro(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== preprocessing ======")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.save_dir,
                            args.train_files, args.dev_files, args.test_files, prepare=True)

    vocab = Vocab(lower=True)
    for word in dataloader.word_iter('train'):
        vocab.add_word(word)
    del dataloader
    unfiltered_vocab_size = vocab.word_size()
    vocab.filter_words_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.word_size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(filtered_num, vocab.word_size()))

    logger.info('Assigning embeddings...')
    if args.pretrained_word_path is not None:
        vocab.load_pretrained_word_embeddings(args.pretrained_word_path)
    else:
        vocab.randomly_init_word_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('====== Done with preparing! ======')
Example #17
def prepare(args):
    """
    检查数据,创建目录,准备词汇表和词嵌入
    checks data, creates the directories, prepare the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #18
def predict(args):
    """
    predicts answers for test files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for test set...')
    test_batches = brc_data.gen_mini_batches('test',
                                             args.batch_size,
                                             pad_id=vocab.get_id(
                                                 vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir,
                      result_prefix='test.predicted')
Example #19
def predict(args):
    logger = logging.getLogger("QAPointNet")

    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

    assert len(args.test_files) > 0, 'No test files are provided.'
    dataloader = BRCDataset(args.max_p_num,
                            args.max_p_len,
                            args.max_q_len,
                            args.save_dir,
                            test_files=args.test_files)
    num_train_steps = int(
        len(dataloader.train_set) / args.batch_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    logger.info('Converting text into ids...')
    dataloader.convert_to_ids(vocab)
    logger.info('Restoring the model...')

    model = RCModel(vocab, num_train_steps, num_warmup_steps, args)
    model.restore(args.model_dir, 'BIDAF_18000')
    logger.info('Predicting answers for test set...')
    test_batches = dataloader.gen_mini_batches('test',
                                               64,
                                               vocab.get_word_id(
                                                   vocab.pad_token),
                                               shuffle=False)

    model.evaluate(test_batches,
                   result_dir=args.result_dir,
                   result_prefix='test.predicted')
Example #20
File: run.py Project: Yaozeng/MRC
def evaluate(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== evaluating ======")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)

    assert len(args.dev_files) > 0, 'No dev files are provided.'
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.save_dir, dev_files=args.dev_files)

    num_train_steps = int(
        len(dataloader.train_set) / args.batch_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    logger.info('Converting text into ids...')
    dataloader.convert_to_ids(vocab)

    logger.info('Restoring the model...')
    model = RCModel(vocab, num_train_steps, num_warmup_steps, args)
    model.restore(args.model_dir, 'BIDAF_42000')
    logger.info('Evaluating the model on dev set...')
    dev_batches = dataloader.gen_mini_batches('dev', 64, vocab.get_word_id(vocab.pad_token), shuffle=False)

    dev_loss, dev_bleu_rouge = model.evaluate(
        dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted')

    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))
Example #21
def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.algo, args.max_p_num, args.max_p_len,
                          args.max_q_len, args.max_a_len, args.train_files,
                          args.dev_files)
    logger.info('Converting text into ids...')

    brc_data.convert_to_ids(vocab)

    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    if args.restore:
        logger.info('Restoring the model...')
        rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Training the model...')
    rc_model.train(brc_data,
                   args.epochs,
                   args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
Example #22
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    if args.use_char_embed:
        char_vocab = Vocab(lower=True)
    fout = open(os.path.join(args.vocab_dir, 'word.txt'), 'w')
    for word in brc_data.word_iter('train'):
        if word == 'mosi':
            print('xxxxxxxxxx:%s\n' % word)
        vocab.add(word)
        if args.use_char_embed:
            for char in list(word):
                char_vocab.add(char)
        fout.write(word + ', ' + ' '.join(list(word)) + '\n')
    fout.close()
    idx = vocab.get_id('mosi')
    print('yyyyyyyy:mosi_idx:%d\n' % idx)


    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=args.min_cnt)
    logger.info('min_cnt = %d ' % args.min_cnt)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    
    #logger.info('The final char vocab size is {}'.format(char_vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_char_embed:
        char_vocab.randomly_init_embeddings(args.char_embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    
    if args.use_char_embed:
        logger.info('Saving char vocab...')
        with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'wb') as fout:
            pickle.dump(char_vocab, fout)

    logger.info('Done with preparing!')
Example #23
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Building vocabulary...')
    # load the data
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          args.train_files,
                          args.dev_files,
                          args.test_files,
                          prepare=True)
    # build the vocabulary
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    logger.info('Assigning embeddings...')
    # either 1. randomly initialize the embeddings or 2. load pretrained embeddings
    if not args.pretrain:
        # keep only tokens that appear at least twice
        vocab.filter_tokens_by_cnt(min_cnt=2)
        vocab.randomly_init_embeddings(args.embed_size)
    else:
        pre_train(brc_data, args.segmented_dir)
        vocab.load_pretrained_embeddings(
            os.path.join(args.segmented_dir, 'w2v_dic.data'))
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'wb') as fout:
        pickle.dump(vocab, fout)
    fout.close()

    logger.info('Saving sets...')
    with open(os.path.join(args.prepared_dir, 'train_set.pkl'),
              'wb') as f_train_out:
        pickle.dump(brc_data.train_set, f_train_out)
    f_train_out.close()
    with open(os.path.join(args.prepared_dir, 'dev_set.pkl'),
              'wb') as f_dev_out:
        pickle.dump(brc_data.dev_set, f_dev_out)
    f_dev_out.close()
    with open(os.path.join(args.prepared_dir, 'test_set.pkl'),
              'wb') as f_test_out:
        pickle.dump(brc_data.test_set, f_test_out)
    f_test_out.close()

    logger.info('Done with preparing!')
Example #24
def prepare(logger, args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger.info('Checking the data files...')
    for data_path in args.trainset + args.devset + args.testset:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    """
    args.vocab_dir:
        ../data/vocab, help="vocabulary".
    
    args.save_dir:
        ../data/models, help="Specify the path to save trained models".
    
    args.result_dir:
        ../data/results/, help="the dir to output the results".
    
    """
    for dir_path in [args.vocab_dir, args.save_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    """
    max_p_num=5
    max_p_len=500
    max_q_len=60
    """
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset, args.testset)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)  # embed_size (default) = 300
    vocab.get_vector(args.embed_size)  # embed_size (default) = 300

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #25
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    # make sure every data path exists
    # train_files, dev_files, test_files
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    # create the output directories if they do not exist
    # vocab_dir, model_dir, result_dir, summary_dir
    logger.info('Preparing the directories...')
    for dir_path in [
            args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    # prepare the data:
    # pass the max passage count, max passage length, max question length and the train/dev/test file paths
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    # create the vocabulary
    vocab = Vocab(lower=True)
    # collect every token and build token2id / id2token
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    # vocab size before filtering
    unfiltered_vocab_size = vocab.size()
    # filter out tokens with frequency below 2
    vocab.filter_tokens_by_cnt(min_cnt=2)
    # filtered count = size before filtering - size after filtering
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filtering {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    #
    logger.info('Assigning embeddings...')
    # randomly initialize the embeddings, vocab size * embed_size
    # e.g. 5006 * 300
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    # save the vocabulary
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
Example #26
def evaluate(logger, args):
    """evaluate a specific model using devset"""
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
        logger.info('vocab size is {} and embed dim is {}'.format(
            vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          dev_files=args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # initialize parameters
            if not args.use_gpu:
                place = fluid.CPUPlace()
                dev_count = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            else:
                place = fluid.CUDAPlace(0)
                dev_count = fluid.core.get_cuda_device_count()

            exe = Executor(place)
            if args.load_dir:
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(exe,
                                           args.load_dir,
                                           main_program=main_program)
            else:
                logger.error('No model file to load ...')
                return

            inference_program = main_program.clone(for_test=True)
            eval_loss, bleu_rouge = validation(inference_program, avg_cost,
                                               s_probs, e_probs, match,
                                               feed_order, place, dev_count,
                                               vocab, brc_data, logger, args)
            logger.info('Dev eval loss {}'.format(eval_loss))
            logger.info('Dev eval result: {}'.format(bleu_rouge))
            logger.info('Predicted answers are saved to {}'.format(
                os.path.join(args.result_dir)))
Example #27
def calculate_unk(train_files, target_files):
    brc_data = BRCDataset(5, 500, 60, train_files, target_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    vocab.filter_tokens_by_cnt(min_cnt=2)
    overlap_num = 0
    dev_vocab = set()
    for word in brc_data.word_iter('dev'):
        dev_vocab.add(word)
    for word in dev_vocab:
        if word in vocab.token2id:
            overlap_num += 1
    print("over lap word is {} in {}".format(overlap_num, len(dev_vocab)))
Example #28
def train(args):
    """
    trains the reading comprehension model
    """
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    brc_data.convert_to_ids(vocab)
    rc_model = RCModel(vocab, args)
    rc_model.train(brc_data,
                   args.epochs,
                   args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo)
Example #29
def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Building SIF Model...')
    sif_model = SIFModel(args, logger, True, args.train_files, args.dev_files,
                         args.test_files)
    logger.info('Training word embeddings...')
    sif_model.train_embeddings()
    logger.info('Building pc and sif embeddings...')
    sif_model.build_pc_and_sif_embedding_list()
    sif_model.load_model()
    logger.info('Building vocabulary...')
    # build the vocabulary
    vocab = Vocab(lower=True)
    # load pretrained embeddings
    vocab.load_pretrained_embeddings(
        os.path.join(args.prepared_dir, 'w2v_dic.pkl'))
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'wb') as fvocab:
        pickle.dump(vocab, fvocab)
    fvocab.close()
    # build the datasets
    logger.info('Loading dataset...')
    brc_data = BRCDataset(args.max_p_num,
                          args.max_p_len,
                          args.max_q_len,
                          args.prepared_dir,
                          args.train_files,
                          args.dev_files,
                          args.test_files,
                          prepare=True)

    logger.info('Done with preparing!')
Example #30
def train(args):
    logger = logging.getLogger("rc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        data_vocabs = pickle.load(fin)
    args.pos_size = data_vocabs.pos_vocab.size()
    args.ner_size = data_vocabs.ner_vocab.size()
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_file, args.dev_file, args.test_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(data_vocabs)
    logger.info('Saving the args')
    pickle.dump(args, open(args.args_file, 'wb'))
    logger.info('Initialize the model...')
    rc_model = DrqaModel(data_vocabs.word_vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data)
Example #31
def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Loading the dataset and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initializing the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
Example #32
def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')
Example #33
def predict(args):
    """
    predicts answers for test files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for test set...')
    test_batches = brc_data.gen_mini_batches('test', args.batch_size,
                                             pad_id=vocab.get_id(vocab.pad_token), shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir, result_prefix='test.predicted')
Example #34
def evaluate(args):
    """
    evaluate the trained model on dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, dev_files=args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev', args.batch_size,
                                            pad_id=vocab.get_id(vocab.pad_token), shuffle=False)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(
        dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))