def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    # Load the dataset and the vocabulary saved by prepare()
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        # pickle is a standard Python module; read back the vocab object written when prepare() was run
        vocab = pickle.load(fin)
    # Maximum number of passages, passage length and question length;
    # only the train and dev files are needed at training time
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    # Use the vocab to convert brc_data into ids
    logger.info('Converting text into ids...')
    # Map the words of questions and passages in the raw data to the ids stored in the vocabulary
    brc_data.convert_to_ids(vocab)
    # Initialize the neural network
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    # Train the model with data
    # Args:
    #     data: the BRCDataset class implemented in dataset.py
    #     epochs: number of training epochs
    #     batch_size:
    #     save_dir: the directory to save the model
    #     save_prefix: the prefix indicating the model type
    #     dropout_keep_prob: float value indicating dropout keep probability
    #     evaluate: whether to evaluate the model on test set after each epoch
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def evaluate(args):
    """
    evaluates the trained model on the dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    # assert len(args.test_files) > 0, 'No test files are provided.'
    # brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
    #                       args.max_train_sample_num, args.test_files, use_type="test")
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_train_sample_num, args.dev_files, use_type="dev")
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    rc_model = S_netModel(vocab, args)
    logger.info('Restoring the model...')
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating answers for dev set...')
    test_batches = brc_data.gen_mini_batches('dev', args.batch_size,
                                             pad_id=vocab.get_id(vocab.pad_token),
                                             shuffle=False)
    # rc_model.predict(test_batches, result_dir=args.result_dir, result_prefix=args.result_prefix)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir, result_prefix=args.result_prefix)


def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    # logger.info('Assigning embeddings...')
    # vocab.embed_dim = args.embed_size
    # vocab.load_pretrained_embeddings(args.embedding_path)
    logger.info('Vocabulary %s' % vocab.size())
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          vocab, args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    # rc_model = MTRCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')


def train(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== training ======")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                            args.save_dir, args.train_files, args.dev_files)
    num_train_steps = int(len(dataloader.train_set) / args.batch_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    logger.info('Converting text into ids...')
    dataloader.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    model = RCModel(vocab, num_train_steps, num_warmup_steps, args)
    del vocab
    logger.info('Training the model...')
    model.train(dataloader, args.epochs, args.batch_size,
                save_dir=args.model_dir,
                save_prefix=args.algo,
                dropout=args.dropout)
    logger.info('====== Done with model training! ======')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    for dir_path in [args.vocab_dir, args.model_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    # unfiltered_vocab_size = vocab.size()
    print("vocab size is ", vocab.size())
    vocab.filter_tokens_by_cnt(min_cnt=2)
    print("after filtered vocab size is ", vocab.size())
    # filtered_num = unfiltered_vocab_size - vocab.size()
    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_pre_train:
        vocab.load_pretrained_embeddings(args.pre_train_file)
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)


def prepare(args):
    logger = logging.getLogger("rc")
    logger.info('train test split...')
    train_test_split(args.all_file, args.train_file, args.test_file, args.train_rate)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_file, args.dev_file, args.test_file)
    data_vocabs = DataVocabs()
    for word, pos, ner in brc_data.word_iter('train'):
        data_vocabs.word_vocab.add(word)
        data_vocabs.pos_vocab.add(pos)
        data_vocabs.ner_vocab.add(ner)
    unfiltered_vocab_size = data_vocabs.word_vocab.size()
    data_vocabs.word_vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - data_vocabs.word_vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, data_vocabs.word_vocab.size()))
    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)
    data_vocabs.word_vocab.load_pretrained_embeddings(args.embedding_path)
    logger.info('embedding size: {}, {}'.format(
        len(data_vocabs.word_vocab.embeddings),
        len(data_vocabs.word_vocab.embeddings[0])))
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(data_vocabs, fout)
    logger.info('Done with preparing!')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("Cail")
    logger.info('Checking the data files...')
    print("Checking the data files...")
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    logger.info('Building vocabulary...')
    # Build the dataset
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)
    logger.info("Tokens num {}".format(vocab.size()))
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=5)  # filter low-frequency tokens
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)  # step 1: randomly initialize embeddings
    # vocab.load_pretrained_embeddings(args.embedding_path)  # step 2: load pre-trained (e.g. GloVe) embeddings for the vocab
    # logger.info("Vocab size is {} from embedding".format(vocab.size()))
    logger.info('Saving vocab...')
    with open(args.vocab_path, 'wb') as fout:  # save the vocabulary
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')


def predict(args):
    """
    predicts answers for the test files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for test set...')
    test_batches = brc_data.gen_mini_batches('test', args.batch_size,
                                             pad_id=vocab.get_id(vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir, result_prefix='test.predicted')


def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # Load the vocab object, which holds token2id, id2token and other helpers
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    # brc_data.save_set_file(brc_data.dev_set, './save_sets', 'dev_set')
    # brc_data.save_set_file(brc_data.test_set, './save_sets', 'test_set')
    # brc_data.save_set_file(brc_data.train_set, './save_sets', 'train_set')
    logger.info('Converting text into ids...')
    # Convert the raw data in [self.train_set, self.dev_set, self.test_set] into id form
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    # Restore the previously saved model, if needed
    # rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    # ****************************************************************
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def train(args):
    """
    trains the reading comprehension model
    """
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger("brc")
    file_handler = logging.FileHandler(args.log_path)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    logger.info(args)
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def evaluate(args):
    """
    evaluates the trained model on the dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          dev_files=args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev', args.batch_size,
                                            pad_id=vocab.get_id(vocab.pad_token),
                                            shuffle=False)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(dev_batches,
                                                 result_dir=args.result_dir,
                                                 result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):  # the vocabulary is built from the training set only
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)  # TODO: load_pretrained_embeddings
    vocab.load_pretrained_embeddings(args.embedding_path)  # GloVe pre-trained
    logger.info('Saving vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
    with open(args.vocab_path, 'wb') as fout:  # no distinction between search & zhidao
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')


def evaluate(args):
    """
    evaluate the trained model on dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          dev_files=args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev', args.batch_size,
                                            pad_id=vocab.get_id(vocab.pad_token),
                                            shuffle=False)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(dev_batches,
                                                 result_dir=args.result_dir,
                                                 result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))


def prepro(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== preprocessing ======")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                            args.save_dir, args.train_files, args.dev_files,
                            args.test_files, prepare=True)
    vocab = Vocab(lower=True)
    for word in dataloader.word_iter('train'):
        vocab.add_word(word)
    del dataloader
    unfiltered_vocab_size = vocab.word_size()
    vocab.filter_words_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.word_size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.word_size()))
    logger.info('Assigning embeddings...')
    if args.pretrained_word_path is not None:
        vocab.load_pretrained_word_embeddings(args.pretrained_word_path)
    else:
        vocab.randomly_init_word_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('====== Done with preparing! ======')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')


def predict(args):
    """
    predicts answers for test files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for test set...')
    test_batches = brc_data.gen_mini_batches('test', args.batch_size,
                                             pad_id=vocab.get_id(vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir, result_prefix='test.predicted')


def predict(args):
    logger = logging.getLogger("QAPointNet")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                            args.save_dir, test_files=args.test_files)
    num_train_steps = int(len(dataloader.train_set) / args.batch_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    logger.info('Converting text into ids...')
    dataloader.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    model = RCModel(vocab, num_train_steps, num_warmup_steps, args)
    model.restore(args.model_dir, 'BIDAF_18000')
    logger.info('Predicting answers for test set...')
    test_batches = dataloader.gen_mini_batches('test', 64,
                                               vocab.get_word_id(vocab.pad_token),
                                               shuffle=False)
    model.evaluate(test_batches,
                   result_dir=args.result_dir, result_prefix='test.predicted')


def evaluate(args):
    logger = logging.getLogger("QAPointNet")
    logger.info("====== evaluating ======")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    dataloader = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                            args.save_dir, dev_files=args.dev_files)
    num_train_steps = int(len(dataloader.train_set) / args.batch_size * args.epochs)
    num_warmup_steps = int(num_train_steps * args.warmup_proportion)
    logger.info('Converting text into ids...')
    dataloader.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    model = RCModel(vocab, num_train_steps, num_warmup_steps, args)
    model.restore(args.model_dir, 'BIDAF_42000')
    logger.info('Evaluating the model on dev set...')
    dev_batches = dataloader.gen_mini_batches('dev', 64,
                                              vocab.get_word_id(vocab.pad_token),
                                              shuffle=False)
    dev_loss, dev_bleu_rouge = model.evaluate(dev_batches,
                                              result_dir=args.result_dir,
                                              result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))


def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    # with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
    with open(args.vocab_path, 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.algo, args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_a_len, args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    if args.restore:
        logger.info('Restoring the model...')
        rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    vocab = Vocab(lower=True)
    if args.use_char_embed:
        char_vocab = Vocab(lower=True)
    fout = open(os.path.join(args.vocab_dir, 'word.txt'), 'w')
    for word in brc_data.word_iter('train'):
        if word == 'mosi':  # debug check for a specific token
            print('xxxxxxxxxx:%s\n' % word)
        vocab.add(word)
        if args.use_char_embed:
            for char in list(word):
                char_vocab.add(char)
        fout.write(word + ', ' + ' '.join(list(word)) + '\n')
    fout.close()
    idx = vocab.get_id('mosi')  # debug check for a specific token
    print('yyyyyyyy:mosi_idx:%d\n' % idx)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=args.min_cnt)
    logger.info('min_cnt = %d ' % args.min_cnt)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))
    # logger.info('The final char vocab size is {}'.format(char_vocab.size()))
    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    if args.use_char_embed:
        char_vocab.randomly_init_embeddings(args.char_embed_size)
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    if args.use_char_embed:
        logger.info('Saving char vocab...')
        with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'wb') as fout:
            pickle.dump(char_vocab, fout)
    logger.info('Done with preparing!')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Building vocabulary...')
    # Load the data
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files,
                          prepare=True)
    # Build the vocabulary
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    logger.info('Assigning embeddings...')
    # Either 1) randomly initialize embeddings or 2) load pre-trained embeddings
    if not args.pretrain:
        # Keep only tokens that occur at least twice
        vocab.filter_tokens_by_cnt(min_cnt=2)
        vocab.randomly_init_embeddings(args.embed_size)
    else:
        pre_train(brc_data, args.segmented_dir)
        vocab.load_pretrained_embeddings(
            os.path.join(args.segmented_dir, 'w2v_dic.data'))
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Saving sets...')
    with open(os.path.join(args.prepared_dir, 'train_set.pkl'), 'wb') as f_train_out:
        pickle.dump(brc_data.train_set, f_train_out)
    with open(os.path.join(args.prepared_dir, 'dev_set.pkl'), 'wb') as f_dev_out:
        pickle.dump(brc_data.dev_set, f_dev_out)
    with open(os.path.join(args.prepared_dir, 'test_set.pkl'), 'wb') as f_test_out:
        pickle.dump(brc_data.test_set, f_test_out)
    logger.info('Done with preparing!')


def prepare(logger, args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger.info('Checking the data files...')
    for data_path in args.trainset + args.devset + args.testset:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    # args.vocab_dir: ../data/vocab, help="vocabulary".
    # args.save_dir: ../data/models, help="Specify the path to save trained models".
    # args.result_dir: ../data/results/, help="the dir to output the results".
    for dir_path in [args.vocab_dir, args.save_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    logger.info('Building vocabulary...')
    # max_p_num=5, max_p_len=500, max_q_len=60
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset, args.testset)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)  # embed_size (default) = 300
    vocab.get_vector(args.embed_size)  # embed_size (default) = 300
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    # Make sure the train/dev/test file paths exist
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    # Prepare the output directories (vocab_dir, model_dir, result_dir, summary_dir),
    # creating any that do not exist
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    # Load the data, passing the maximum passage number, passage length and question
    # length, plus the train / dev / test file paths
    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files, args.test_files)
    # Build the vocabulary (token2id, id2token, ...) from all training words
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    # Vocabulary size before filtering
    unfiltered_vocab_size = vocab.size()
    # Filter out words that occur fewer than 2 times
    vocab.filter_tokens_by_cnt(min_cnt=2)
    # Number of filtered tokens = size before filtering - size after filtering
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))
    # logger.info('Assigning embeddings...')
    # Randomly initialize the embeddings, shape: vocab size * 300 (e.g. 5006 * 300)
    vocab.randomly_init_embeddings(args.embed_size)
    # Save the vocabulary
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')


def evaluate(logger, args):
    """evaluate a specific model using devset"""
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    logger.info('vocab size is {} and embed dim is {}'.format(vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          dev_files=args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # initialize parameters
            if not args.use_gpu:
                place = fluid.CPUPlace()
                dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            else:
                place = fluid.CUDAPlace(0)
                dev_count = fluid.core.get_cuda_device_count()
            exe = Executor(place)
            if args.load_dir:
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(exe, args.load_dir, main_program=main_program)
            else:
                logger.error('No model file to load ...')
                return
            inference_program = main_program.clone(for_test=True)
            eval_loss, bleu_rouge = validation(inference_program, avg_cost, s_probs,
                                               e_probs, match, feed_order, place,
                                               dev_count, vocab, brc_data, logger, args)
            logger.info('Dev eval loss {}'.format(eval_loss))
            logger.info('Dev eval result: {}'.format(bleu_rouge))
            logger.info('Predicted answers are saved to {}'.format(
                os.path.join(args.result_dir)))


def calculate_unk(train_files, target_files):
    brc_data = BRCDataset(5, 500, 60, train_files, target_files)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)
    vocab.filter_tokens_by_cnt(min_cnt=2)
    overlap_num = 0
    dev_vocab = set()
    for word in brc_data.word_iter('dev'):
        dev_vocab.add(word)
    for word in dev_vocab:
        if word in vocab.token2id:
            overlap_num += 1
    print("overlapping words: {} out of {}".format(overlap_num, len(dev_vocab)))


def train(args):
    """
    trains the reading comprehension model
    """
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    brc_data.convert_to_ids(vocab)
    rc_model = RCModel(vocab, args)
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir, save_prefix=args.algo)


def prepare(args):
    """
    checks data, creates the directories, prepares the vocabulary and embeddings
    """
    logger = logging.getLogger("brc")
    logger.info('Building SIF Model...')
    sif_model = SIFModel(args, logger, True, args.train_files, args.dev_files, args.test_files)
    logger.info('Training word embeddings...')
    sif_model.train_embeddings()
    logger.info('Building pc and sif embeddings...')
    sif_model.build_pc_and_sif_embedding_list()
    sif_model.load_model()
    logger.info('Building vocabulary...')
    # Build the vocabulary
    vocab = Vocab(lower=True)
    # Load the pre-trained word vectors
    vocab.load_pretrained_embeddings(os.path.join(args.prepared_dir, 'w2v_dic.pkl'))
    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.pkl'), 'wb') as fvocab:
        pickle.dump(vocab, fvocab)
    # Build the dataset
    logger.info('Loading dataset...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.prepared_dir, args.train_files, args.dev_files,
                          args.test_files, prepare=True)
    logger.info('Done with preparing!')


def train(args):
    logger = logging.getLogger("rc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        data_vocabs = pickle.load(fin)
    args.pos_size = data_vocabs.pos_vocab.size()
    args.ner_size = data_vocabs.ner_vocab.size()
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_file, args.dev_file, args.test_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(data_vocabs)
    logger.info('Saving the args')
    with open(args.args_file, 'wb') as fout:
        pickle.dump(args, fout)
    logger.info('Initialize the model...')
    rc_model = DrqaModel(data_vocabs.word_vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data)


def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def train(args):
    """
    trains the reading comprehension model
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.train_files, args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')
    rc_model = RCModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train(brc_data, args.epochs, args.batch_size,
                   save_dir=args.model_dir,
                   save_prefix=args.algo,
                   dropout_keep_prob=args.dropout_keep_prob)
    logger.info('Done with model training!')


def predict(args):
    """
    predicts answers for test files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.test_files) > 0, 'No test files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          test_files=args.test_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Predicting answers for test set...')
    test_batches = brc_data.gen_mini_batches('test', args.batch_size,
                                             pad_id=vocab.get_id(vocab.pad_token),
                                             shuffle=False)
    rc_model.evaluate(test_batches,
                      result_dir=args.result_dir, result_prefix='test.predicted')


def evaluate(args):
    """
    evaluate the trained model on dev files
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          dev_files=args.dev_files)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Restoring the model...')
    rc_model = RCModel(vocab, args)
    rc_model.restore(model_dir=args.model_dir, model_prefix=args.algo)
    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev', args.batch_size,
                                            pad_id=vocab.get_id(vocab.pad_token),
                                            shuffle=False)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(
        dev_batches, result_dir=args.result_dir, result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(os.path.join(args.result_dir)))


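# ---------------------------------------------------------------------------
# A minimal sketch (not taken from any of the implementations above) of how
# prepare/train/evaluate/predict entry points like these are typically wired
# to the command line. The flag names (--prepare, --train, --evaluate,
# --predict) and the parse_args() helper are illustrative assumptions; a real
# script would also expose the data/model options the functions above read
# from args (vocab_dir, model_dir, max_p_len, batch_size, ...).
# ---------------------------------------------------------------------------
import argparse
import logging


def parse_args():
    """Hypothetical argument parser; only the stage-selection flags are shown."""
    parser = argparse.ArgumentParser('Reading comprehension pipeline (sketch)')
    parser.add_argument('--prepare', action='store_true',
                        help='build the vocabulary and embeddings')
    parser.add_argument('--train', action='store_true', help='train the model')
    parser.add_argument('--evaluate', action='store_true', help='evaluate on the dev set')
    parser.add_argument('--predict', action='store_true', help='predict on the test set')
    return parser.parse_args()


def run():
    """Parses command-line flags and dispatches to the selected stage."""
    args = parse_args()
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.prepare:
        prepare(args)
    if args.train:
        train(args)
    if args.evaluate:
        evaluate(args)
    if args.predict:
        predict(args)


if __name__ == '__main__':
    run()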