def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) # logger.info('Assigning embeddings...') # vocab.embed_dim = args.embed_size # vocab.load_pretrained_embeddings(args.embedding_path) logger.info('Vocabulary %s' % vocab.size()) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, vocab, args.train_files, args.dev_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') rc_model = RCModel(vocab, args) # rc_model = MTRCModel(vocab, args) logger.info('Training the model...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')
def evaluate(args): """ 对训练好的模型进行验证 """ logger = logging.getLogger("brc") logger.info('加wudi...') logger.info('加载数据集和词汇表...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.dev_files) > 0, '找不到验证文件.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, dev_files=args.dev_files) logger.info('把文本转化为id序列...') brc_data.convert_to_ids(vocab) logger.info('重载模型...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('验证模型...') dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) dev_loss, dev_bleu_rouge = rc_model.evaluate(dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted') logger.info('验证集上的损失为: {}'.format(dev_loss)) logger.info('验证集的结果: {}'.format(dev_bleu_rouge)) logger.info('预测的答案证保存到 {}'.format(os.path.join(args.result_dir)))
def predict(args): """ 预测测试文件的答案 """ logger = logging.getLogger("brc") logger.info('加载数据集和词汇表...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, '找不到测试文件.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, test_files=args.test_files) logger.info('把文本转化为id序列...') brc_data.convert_to_ids(vocab) logger.info('重载模型...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('预测测试集的答案...') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def train(args): logger = logging.getLogger("QAPointNet") logger.info("====== training ======") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) dataloader = BRCDataset(args.max_p_num,args.max_p_len, args.max_q_len,args.save_dir, args.train_files, args.dev_files) num_train_steps = int( len(dataloader.train_set) / args.batch_size * args.epochs) num_warmup_steps = int(num_train_steps * args.warmup_proportion) logger.info('Converting text into ids...') dataloader.convert_to_ids(vocab) logger.info('Initialize the model...') model = RCModel(vocab, num_train_steps, num_warmup_steps, args) del vocab logger.info('Training the model...') model.train(dataloader, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout=args.dropout) logger.info('====== Done with model training! ======')
def predict(args): """ predicts answers for test files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: with open(args.vocab_path, 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, 'No test files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, test_files=args.test_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Restoring the model...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('Predicting answers for test set...') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def evaluate(args): logger = logging.getLogger("QAPointNet") logger.info("====== evaluating ======") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.dev_files) > 0, 'No dev files are provided.' dataloader = BRCDataset(args.max_p_num,args.max_p_len, args.max_q_len,args.save_dir, dev_files=args.dev_files) num_train_steps = int( len(dataloader.train_set) / args.batch_size * args.epochs) num_warmup_steps = int(num_train_steps * args.warmup_proportion) logger.info('Converting text into ids...') dataloader.convert_to_ids(vocab) logger.info('Restoring the model...') model = RCModel(vocab, num_train_steps,num_warmup_steps,args) model.restore(args.model_dir, 'BIDAF_42000') logger.info('Evaluating the model on dev set...') dev_batches = dataloader.gen_mini_batches('dev', 64, vocab.get_word_id(vocab.pad_token),shuffle=False) dev_loss, dev_bleu_rouge = model.evaluate( dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted') logger.info('Loss on dev set: {}'.format(dev_loss)) logger.info('Result on dev set: {}'.format(dev_bleu_rouge)) logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))
def evaluate(args): """ evaluate the trained model on dev files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: with open(args.vocab_path, 'rb') as fin: vocab = pickle.load(fin) assert len(args.dev_files) > 0, 'No dev files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, dev_files=args.dev_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Restoring the model...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('Evaluating the model on dev set...') dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) dev_loss, dev_bleu_rouge = rc_model.evaluate(dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted') logger.info('Loss on dev set: {}'.format(dev_loss)) logger.info('Result on dev set: {}'.format(dev_bleu_rouge)) logger.info('Predicted answers are saved to {}'.format( os.path.join(args.result_dir)))
def train(args): """ 训练阅读理解模型 """ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger("brc") file_handler = logging.FileHandler(args.log_path) file_handler.setLevel(logging.INFO) file_handler.setFormatter(formatter) logger.addHandler(file_handler) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(formatter) logger.addHandler(console_handler) logger.info(args) logger.info('加载数据集和词汇表...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_len, args.max_q_len, args.train_files, args.dev_files) logger.info('词语转化为id序列...') brc_data.convert_to_ids(vocab) logger.info('初始化模型...') rc_model = RCModel(vocab, args) logger.info('训练模型...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('训练完成!')
def predict(args): logger = logging.getLogger("QAPointNet") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, 'No test files are provided.' dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.save_dir, test_files=args.test_files) num_train_steps = int( len(dataloader.train_set) / args.batch_size * args.epochs) num_warmup_steps = int(num_train_steps * args.warmup_proportion) logger.info('Converting text into ids...') dataloader.convert_to_ids(vocab) logger.info('Restoring the model...') model = RCModel(vocab, num_train_steps, num_warmup_steps, args) model.restore(args.model_dir, 'BIDAF_18000') logger.info('Predicting answers for test set...') test_batches = dataloader.gen_mini_batches('test', 64, vocab.get_word_id( vocab.pad_token), shuffle=False) model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def evaluate(args): """ predicts answers for test files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) # assert len(args.test_files) > 0, 'No test files are provided.' # brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.max_train_sample_num,args.test_files, use_type="test") brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.max_train_sample_num, args.dev_files, use_type="dev") logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) rc_model = S_netModel(vocab, args) logger.info('Restoring the model...') rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('evaluate answers for dev set...') test_batches = brc_data.gen_mini_batches('dev', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) #rc_model.predict(test_batches,result_dir=args.result_dir, result_prefix=args.result_prefix) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix=args.result_prefix)
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: with open(args.vocab_path, 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.algo, args.max_p_num, args.max_p_len, args.max_q_len, args.max_a_len, args.train_files, args.dev_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') rc_model = RCModel(vocab, args) if args.restore: logger.info('Restoring the model...') rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('Training the model...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") # 加载数据集 和 辞典(prepare保存的) logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) # pickle python的标准模块 --prepare运行时vocab的对象信息读取 brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) # 最大 文章数,文章长度,问题长度, # train时候只有训练文件,验证文件 # 利用vocab 把brc_data 转换 成 id logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) # 把原始数据的问题和文章的单词转换成辞典保存的id # 初始化神经网络 logger.info('Initialize the model...') rc_model = RCModel(vocab, args) logger.info('Training the model...') """ Train the model with data Args: data: the BRCDataset class implemented in dataset.py epochs: number of training epochs batch_size: save_dir: the directory to save the model save_prefix: the prefix indicating the model type dropout_keep_prob: float value indicating dropout keep probability evaluate: whether to evaluate the model on test set after each epoch """ rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') # 加载 vocab对象 ,包括 token2id id2token 以及其它方法 with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) # brc_data.save_set_file(brc_data.dev_set, './save_sets', 'dev_set') # brc_data.save_set_file(brc_data.test_set, './save_sets', 'test_set') # brc_data.save_set_file(brc_data.train_set, './save_sets', 'train_set') logger.info('Converting text into ids...') # [self.train_set, self.dev_set, self.test_set] 原始数据 转为id形式 brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') rc_model = RCModel(vocab, args) # 加载上次保存的模型 # rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) # **************************************************************** logger.info('Training the model...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Loading vocab...') with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'rb') as fin: vocab = pickle.load(fin) fin.close() pad_id = vocab.get_id(vocab.pad_token) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.prepared_dir, args.train_files, args.dev_files, args.test_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) g = tf.Graph() with g.as_default(): rc_model = RCModel(vocab.embeddings, pad_id, args) del vocab # Train with tf.name_scope("Train"): logger.info('Training the model...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.result_dir, save_prefix='test.predicted', dropout_keep_prob=args.dropout_keep_prob) tf.summary.FileWriter(args.summary_dir, g).close() with tf.name_scope('Valid'): assert len(args.dev_files) > 0, 'No dev files are provided.' logger.info('Evaluating the model on dev set...') dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, pad_id=pad_id, shuffle=False) dev_loss, dev_bleu_rouge = rc_model.evaluate( dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted') logger.info('Loss on dev set: {}'.format(dev_loss)) logger.info('Result on dev set: {}'.format(dev_bleu_rouge)) logger.info('Predicted answers are saved to {}'.format( os.path.join(args.result_dir))) with tf.name_scope('Test'): assert len(args.test_files) > 0, 'No test files are provided.' logger.info('Predicting answers for test set...') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=pad_id, shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def evaluate(logger, args): """evaluate a specific model using devset""" logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) logger.info('vocab size is {} and embed dim is {}'.format( vocab.size(), vocab.embed_dim)) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, dev_files=args.devset) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') # build model main_program = fluid.Program() startup_prog = fluid.Program() with fluid.program_guard(main_program, startup_prog): with fluid.unique_name.guard(): avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model( args.hidden_size, vocab, args) # initialize parameters if not args.use_gpu: place = fluid.CPUPlace() dev_count = int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) else: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() exe = Executor(place) if args.load_dir: logger.info('load from {}'.format(args.load_dir)) fluid.io.load_persistables(exe, args.load_dir, main_program=main_program) else: logger.error('No model file to load ...') return inference_program = main_program.clone(for_test=True) eval_loss, bleu_rouge = validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args) logger.info('Dev eval loss {}'.format(eval_loss)) logger.info('Dev eval result: {}'.format(bleu_rouge)) logger.info('Predicted answers are saved to {}'.format( os.path.join(args.result_dir)))
def train(args): """ trains the reading comprehension model """ with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) brc_data.convert_to_ids(vocab) rc_model = RCModel(vocab, args) rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo)
def train(args): logger = logging.getLogger("rc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: data_vocabs = pickle.load(fin) args.pos_size = data_vocabs.pos_vocab.size() args.ner_size = data_vocabs.ner_vocab.size() brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_file, args.dev_file, args.test_file) logger.info('Converting text into ids...') brc_data.convert_to_ids(data_vocabs) logger.info('Saving the args') pickle.dump(args, open(args.args_file, 'wb')) logger.info('Initialize the model...') rc_model = DrqaModel(data_vocabs.word_vocab, args) logger.info('Training the model...') rc_model.train(brc_data)
def prepare(args): """ checks data, creates the directories, prepare the vocabulary and embeddings """ logger = logging.getLogger("brc") logger.info('Checking the data files...') for data_path in args.train_files + args.dev_files + args.test_files: assert os.path.exists(data_path), '{} file does not exist.'.format( data_path) logger.info('Preparing the directories...') for dir_path in [ args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir ]: if not os.path.exists(dir_path): os.makedirs(dir_path) logger.info('Building vocabulary...') brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files, args.test_files) vocab = Vocab(lower=True) for word in brc_data.word_iter('train'): vocab.add(word) unfiltered_vocab_size = vocab.size() vocab.filter_tokens_by_cnt(min_cnt=2) filtered_num = unfiltered_vocab_size - vocab.size() logger.info('After filter {} tokens, the final vocab size is {}'.format( filtered_num, vocab.size())) # logger.info('Assigning embeddings...') # vocab.randomly_init_embeddings(args.embed_size) #random init in prepare!! #save the datasets to records files. logger.info('Saving the datasets.') brc_data.convert_to_ids(vocab) pad_id = vocab.get_id(vocab.pad_token) brc_data.save_records(pad_id) logger.info('Saving vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: pickle.dump(vocab, fout) logger.info('Done with preparing!')
def train(args): """ 训练阅读理解模型 """ logger = logging.getLogger("brc") logger.info('加载数据集和词汇表...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) logger.info('词语转化为id序列...') brc_data.convert_to_ids(vocab) logger.info('初始化模型...') rc_model = RCModel(vocab, args) logger.info('训练模型...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('训练完成!')
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') rc_model = RCModel(vocab, args) logger.info('Training the model...') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')
def evaluate(args): """ evaluate the trained model on dev files """ with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.dev_files) > 0, 'No dev files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, dev_files=args.dev_files) brc_data.convert_to_ids(vocab) rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + '_7') dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) bleu_rouge = rc_model.evaluate(dev_batches)
def predict(args): """ predicts answers for test files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, 'No test files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, test_files=args.test_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Restoring the model...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('Predicting answers for test set...') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id(vocab.pad_token), shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) if args.word2vec_path: logger.info('learn_word_embedding:{}'.format(args.learn_word_embedding)) logger.info('loadding %s \n' % args.word2vec_path) word2vec = gensim.models.Word2Vec.load(args.word2vec_path) vocab.load_pretrained_embeddings_from_w2v(word2vec.wv) logger.info('load pretrained embedding from %s done\n' % args.word2vec_path) if args.use_char_embed: with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'rb') as fin: char_vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) steps_per_epoch = brc_data.size('train') // args.batch_size args.decay_steps = args.decay_epochs * steps_per_epoch logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) if args.use_char_embed: logger.info('Converting text into char ids...') brc_data.convert_to_char_ids(char_vocab) logger.info('Binding char_vocab to args to pass to RCModel') args.char_vocab = char_vocab RCModel = choose_model_by_gpu_setting(args) logger.info('Initialize the model...') rc_model = RCModel(vocab, args) logger.info('Training the model...{}'.format(RCModel.__name__)) rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')
def predict(args): """ predicts answers for test files """ with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, 'No test files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, test_files=args.test_files) brc_data.convert_to_ids(vocab) rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def evaluate(args): """ evaluate the trained model on dev files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.dev_files) > 0, 'No dev files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, dev_files=args.dev_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Restoring the model...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('Evaluating the model on dev set...') dev_batches = brc_data.gen_mini_batches('dev', args.batch_size, pad_id=vocab.get_id(vocab.pad_token), shuffle=False) dev_loss, dev_bleu_rouge = rc_model.evaluate( dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted') logger.info('Loss on dev set: {}'.format(dev_loss)) logger.info('Result on dev set: {}'.format(dev_bleu_rouge)) logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))
def predict(args): """ predicts answers for test files """ logger = logging.getLogger("rc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: data_vocabs = pickle.load(fin) assert args.test_file, 'No test files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, test_file=args.test_file) logger.info('Converting text into ids...') brc_data.convert_to_ids(data_vocabs) logger.info('Restoring the model...') args = pickle.load(open(args.args_file, 'rb')) args.pos_size = data_vocabs.pos_vocab.size() args.ner_size = data_vocabs.ner_vocab.size() rc_model = DrqaModel(data_vocabs.word_vocab, args, eva=True) rc_model.evaluate(brc_data)
def predict(args): """ predicts answers for test files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, 'No test files are provided.' brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.max_word_len, test_files=args.test_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Restoring the model...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) rc_model.finalize() # 增加完所有操作后采用sess.graph.finalize() # 来使得整个graph变为只读的 # 注意:tf.train.Saver() # 也算是往graph中添加node, 所以也必须放在finilize前 # 但是,,tf.train.Saver() # 只会存储 # 在该Saver声明时已经存在的变量!!! logger.info('Predicting answers for test set...') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted')
def predict(args): """ predicts answers for test files """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(args.vocab_path, 'rb') as fin: vocab = pickle.load(fin) assert len(args.test_files) > 0, 'No test files are provided.' brc_data = BRCDataset(args.algo, args.max_p_num, args.max_p_len, args.max_q_len, args.max_a_len, test_files=args.test_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Restoring the model...') rc_model = RCModel(vocab, args) rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo) logger.info('Predicting answers for test set...') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) if args.algo == 'YESNO': qa_resultPath = args.test_files[0] #只会有一个文件! (filepath, tempfilename) = os.path.split(qa_resultPath) (qarst_filename, extension) = os.path.splitext(tempfilename) result_prefix = qarst_filename else: result_prefix = 'test.predicted.qa' rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix=result_prefix) if args.algo == 'YESNO': #将YESNO结果合并入QA结果 qa_resultPath = args.test_files[0] #只会有一个文件! yesno_resultPath = args.result_dir + '/' + result_prefix + '.YESNO.json' out_file_path = args.result_dir + '/' + result_prefix + '.134.class.' + str( args.run_id) + '.json' #首先载入YESNO部分的预测结果 yesno_records = {} with open(yesno_resultPath, 'r') as f_in: for line in f_in: sample = json.loads(line) yesno_records[sample['question_id']] = line total_rst_num = 0 with open(qa_resultPath, 'r') as f_in: with open(out_file_path, 'w') as f_out: for line in f_in: total_rst_num += 1 sample = json.loads(line) if sample['question_id'] in yesno_records: line = yesno_records[sample['question_id']] f_out.write(line) print('total rst num : ', total_rst_num) print('yes no label combining done!')
def train(args): """ checks data, creates the directories, prepare the vocabulary and embeddings """ logger = logging.getLogger("brc") logger.info('Checking the data files...') for data_path in args.train_files + args.dev_files + args.test_files: assert os.path.exists(data_path), '{} file does not exist.'.format( data_path) logger.info('Preparing the directories...') for dir_path in [args.vocab_dir, args.model_dir, args.result_dir]: if not os.path.exists(dir_path): os.makedirs(dir_path) logger.info('Building vocabulary...') brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.max_train_sample_num, args.train_files) vocab = Vocab(lower=True) for word in brc_data.word_iter('train'): vocab.add(word) unfiltered_vocab_size = vocab.size() vocab.filter_tokens_by_cnt(min_cnt=2) filtered_num = unfiltered_vocab_size - vocab.size() logger.info('After filter {} tokens, the final vocab size is {}'.format( filtered_num, vocab.size())) logger.info('Assigning embeddings...') vocab.load_pretrained_embeddings(args.word_embedding_path) #vocab.randomly_init_embeddings(300) #vocab1.randomly_init_embeddings(300) logger.info('Saving vocab...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout: pickle.dump(vocab, fout) rc_model = S_netModel(vocab, args) logger.info('Training the model...') #rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo +'sys') #if args.train_as: # rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo + 'syst') rc_model.train(brc_data, args.epochs, args.batch_size, save_dir=args.model_dir, save_prefix=args.algo, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!') logger.info('evaluate the trained model!') test_batches = brc_data.gen_mini_batches('test', args.batch_size, pad_id=vocab.get_id( vocab.pad_token), shuffle=False) rc_model.evaluate(test_batches, result_dir=args.result_dir, result_prefix='test.predicted') logger.info('Done with model evaluating !')
def train(logger, args): """train a model""" logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: if six.PY2: vocab = pickle.load(fin) else: vocab = pickle.load(fin, encoding='bytes') logger.info('vocab size is {} and embed dim is {}'.format( vocab.size(), vocab.embed_dim)) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.trainset, args.devset) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') if not args.use_gpu: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) else: place = fluid.CUDAPlace(0) dev_count = fluid.core.get_cuda_device_count() # build model main_program = fluid.Program() startup_prog = fluid.Program() if args.enable_ce: main_program.random_seed = args.random_seed startup_prog.random_seed = args.random_seed with fluid.program_guard(main_program, startup_prog): with fluid.unique_name.guard(): avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model( args.hidden_size, vocab, args) # clone from default main program and use it as the validation program inference_program = main_program.clone(for_test=True) # build optimizer if args.optim == 'sgd': optimizer = fluid.optimizer.SGD( learning_rate=args.learning_rate) elif args.optim == 'adam': optimizer = fluid.optimizer.Adam( learning_rate=args.learning_rate) elif args.optim == 'rprop': optimizer = fluid.optimizer.RMSPropOptimizer( learning_rate=args.learning_rate) else: logger.error('Unsupported optimizer: {}'.format(args.optim)) exit(-1) if args.weight_decay > 0.0: obj_func = avg_cost + args.weight_decay * l2_loss(main_program) optimizer.minimize(obj_func) else: obj_func = avg_cost optimizer.minimize(obj_func) # initialize parameters place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() exe = Executor(place) if args.load_dir: logger.info('load from {}'.format(args.load_dir)) fluid.io.load_persistables(exe, args.load_dir, main_program=main_program) else: exe.run(startup_prog) embedding_para = fluid.global_scope().find_var( 'embedding_para').get_tensor() embedding_para.set(vocab.embeddings.astype(np.float32), place) # prepare data feed_list = [ main_program.global_block().var(var_name) for var_name in feed_order ] feeder = fluid.DataFeeder(feed_list, place) logger.info('Training the model...') parallel_executor = fluid.ParallelExecutor( main_program=main_program, use_cuda=bool(args.use_gpu), loss_name=avg_cost.name) print_para(main_program, parallel_executor, logger, args) for pass_id in range(1, args.pass_num + 1): pass_start_time = time.time() pad_id = vocab.get_id(vocab.pad_token) if args.enable_ce: train_reader = lambda: brc_data.gen_mini_batches( 'train', args.batch_size, pad_id, shuffle=False) else: train_reader = lambda: brc_data.gen_mini_batches( 'train', args.batch_size, pad_id, shuffle=True) train_reader = read_multiple(train_reader, dev_count) log_every_n_batch, n_batch_loss = args.log_interval, 0 total_num, total_loss = 0, 0 for batch_id, batch_list in enumerate(train_reader(), 1): feed_data = batch_reader(batch_list, args) fetch_outs = parallel_executor.run( feed=list(feeder.feed_parallel(feed_data, dev_count)), fetch_list=[obj_func.name], return_numpy=False) cost_train = np.array(fetch_outs[0]).mean() total_num += args.batch_size * dev_count n_batch_loss += cost_train total_loss += cost_train * args.batch_size * dev_count if args.enable_ce and batch_id >= 100: break if log_every_n_batch > 0 and batch_id % 
log_every_n_batch == 0: print_para(main_program, parallel_executor, logger, args) logger.info( 'Average loss from batch {} to {} is {}'.format( batch_id - log_every_n_batch + 1, batch_id, "%.10f" % (n_batch_loss / log_every_n_batch))) n_batch_loss = 0 if args.dev_interval > 0 and batch_id % args.dev_interval == 0: if brc_data.dev_set is not None: eval_loss, bleu_rouge = validation( inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args) logger.info('Dev eval loss {}'.format(eval_loss)) logger.info( 'Dev eval result: {}'.format(bleu_rouge)) pass_end_time = time.time() time_consumed = pass_end_time - pass_start_time logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format( pass_id, time_consumed)) logger.info( 'Evaluating the model after epoch {}'.format(pass_id)) if brc_data.dev_set is not None: eval_loss, bleu_rouge = validation(inference_program, avg_cost, s_probs, e_probs, match, feed_order, place, dev_count, vocab, brc_data, logger, args) logger.info('Dev eval loss {}'.format(eval_loss)) logger.info('Dev eval result: {}'.format(bleu_rouge)) else: logger.warning( 'No dev set is loaded for evaluation in the dataset!') logger.info('Average train loss for epoch {} is {}'.format( pass_id, "%.10f" % (1.0 * total_loss / total_num))) if pass_id % args.save_interval == 0: model_path = os.path.join(args.save_dir, str(pass_id)) if not os.path.isdir(model_path): os.makedirs(model_path) fluid.io.save_persistables(executor=exe, dirname=model_path, main_program=main_program) if args.enable_ce: # For CE print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_loss / total_num)) if brc_data.dev_set is not None: print("kpis\ttest_cost_card%d\t%f" % (dev_count, eval_loss)) print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
def train(args): """ trains the reading comprehension model """ logger = logging.getLogger("brc") logger.info('Load data_set and vocab...') with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin: vocab = pickle.load(fin) brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len, args.train_files, args.dev_files) logger.info('Converting text into ids...') brc_data.convert_to_ids(vocab) logger.info('Initialize the model...') rc_model = RCModel(vocab, args) logger.info('Training the model...') ####重新加载模型进行训练 file_pre = args.model_dir with tf.Session() as sess: saver = tf.train.Saver() try: saver.restore(sess, file_pre + config['model_name']) except: pass def train_epoch(train_batches, dropout_keep_prob): total_num, total_loss = 0, 0 log_every_n_batch, n_batch_loss = 50, 0 for bitx, batch in enumerate(train_batches, 1): feed_dict = { rc_model.p: batch['passage_token_ids'], rc_model.q: batch['question_token_ids'], rc_model.p_length: batch['passage_length'], rc_model.q_length: batch['question_length'], rc_model.start_label: batch['start_id'], rc_model.end_label: batch['end_id'], rc_model.dropout_keep_prob: dropout_keep_prob } _, loss = rc_model.sess.run([rc_model.train_op, rc_model.loss], feed_dict) total_loss += loss * len(batch['raw_data']) total_num += len(batch['raw_data']) n_batch_loss += loss if log_every_n_batch > 0 and bitx % log_every_n_batch == 0: rc_model.logger.info( 'Average loss from batch {} to {} is {}'.format( bitx - log_every_n_batch + 1, bitx, n_batch_loss / log_every_n_batch)) n_batch_loss = 0 return 1.0 * total_loss / total_num def train(self, data, epochs, batch_size, file_pre, config, dropout_keep_prob=1.0, evaluate=True): pad_id = self.vocab.get_id(self.vocab.pad_token) max_bleu_4 = 0 for epoch in range(1, epochs): rc_model.logger.info( 'Training the model for epoch {}'.format(epoch)) train_batches = data.gen_mini_batches('train', batch_size, pad_id, shuffle=True) train_loss = train_epoch(train_batches, dropout_keep_prob) rc_model.logger.info( 'Average train loss for epoch {} is {}'.format( epoch, train_loss)) if evaluate: logger.info( 'Evaluating the model after epoch {}'.format(epoch)) if data.dev_set is not None: eval_batches = data.gen_mini_batches('dev', batch_size, pad_id, shuffle=False) eval_loss, bleu_rouge = rc_model.evaluate(eval_batches) logger.info('Dev eval loss {}'.format(eval_loss)) rc_model.logger.info( 'Dev eval result: {}'.format(bleu_rouge)) if bleu_rouge['Bleu-4'] > max_bleu_4: rc_model.save(file_pre + config['model_name']) max_bleu_4 = bleu_rouge['Bleu-4'] else: rc_model.logger.warning( 'No dev set is loaded for evaluation in the dataset!' ) else: # self.save(save_dir, save_prefix + '_' + str(epoch))不保存每一个轮次的校验结果 rc_model.save(file_pre + config['model_name']) train(brc_data, args.epochs, args.batch_size, file_pre, config, dropout_keep_prob=args.dropout_keep_prob) logger.info('Done with model training!')