def prepare(args):
    """Check the data files, create the output directories, build the word
    (and optionally char) vocabulary with randomly initialized embeddings,
    and pickle the resulting vocab to ``args.vocab_dir``.

    Args:
        args: parsed command-line namespace; must provide the file lists
            (``train_files``/``dev_files``/``test_files``), the output dirs
            (``vocab_dir``/``model_dir``/``result_dir``/``summary_dir``),
            dataset size limits, embedding sizes, and ``use_char_level``.
    """
    logger = logging.getLogger("brc")

    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in (args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_word_len, args.train_files, args.dev_files,
                          args.test_files)

    vocab = Vocab(lower=True)
    # NOTE(review): the vocabulary is built from train, test AND dev splits
    # (same order as the original three separate loops).
    for split in ('train', 'test', 'dev'):
        for token in brc_data.word_iter(split):
            vocab.add(token)

    size_before_filter = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=5)
    filtered_num = size_before_filter - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    # vocab.load_pretrained_embeddings(args.embedding_path)

    if args.use_char_level:
        # Char-level vocabulary lives in the same Vocab object, via its
        # char-specific counters/filters.
        for split in ('train', 'test', 'dev'):
            for ch in brc_data.char_iter(split):
                vocab.add(ch)

        char_size_before = vocab.char_size()
        vocab.filter_chars_by_cnt(min_cnt=5)
        filtered_num = char_size_before - vocab.char_size()
        logger.info('After filter {} chars, the final vocab size is {}'.format(
            filtered_num, vocab.char_size()))

        logger.info('Assigning char embeddings...')
        vocab.randomly_init_char_embeddings(args.char_embed_size)
        # vocab.load_pretrained_char_embeddings(args.char_embedding_path)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')
def prepare(args):
    """Check the data files, create the output directories, build separate
    word and char vocabularies with word2vec/char2vec-trained embeddings,
    and pickle both into ``args.vocab_dir``.

    NOTE(review): this redefines ``prepare`` and shadows an earlier
    definition with the same name in this module — confirm which variant
    is actually intended to be used.
    """
    logger = logging.getLogger("brc")

    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in (args.vocab_dir, args.model_dir, args.result_dir, args.summary_dir):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    # Build the dataset object.
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.word_size, args.train_files, args.dev_files,
                          args.test_files)

    # Every word from the questions and selected answers becomes vocabulary.
    vocab = Vocab(lower=True)
    for token in brc_data.word_iter('train'):
        vocab.add(token)
    size_before_filter = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=1)  # originally 5
    # How many words the frequency filter removed.
    filtered_num = size_before_filter - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    # vocab.randomly_init_embeddings(args.embed_size)
    vocab.word2vec_init_embeddings(args.embed_size,
                                   sentences=brc_data.generate_word2vec_trainset(),
                                   min_cnt=1)
    # vocab.word2vec_init_embeddings(args.embed_size, sentences=brc_data.generate_word2vec_testset(), min_cnt=1)
    # vocab.glove_init_embeddings(args.embed_size)

    # Character-level vocabulary kept in its own Vocab instance.
    char_vocab = Vocab(lower=True)
    for ch in brc_data.char_iter('train'):
        char_vocab.add(ch)
    size_before_filter = char_vocab.size()
    char_vocab.filter_tokens_by_cnt(min_cnt=1)
    filtered_num = size_before_filter - char_vocab.size()
    logger.info('after filter {} tokens,the char vocab size is {}'.format(filtered_num, char_vocab.size()))
    char_vocab.char2vec_init_embeddings(args.char_embed_size,
                                        sentences=brc_data.generate_char2vec_trainset(),
                                        min_cnt=1)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'wb') as fout:
        pickle.dump(char_vocab, fout)
    logger.info('Done with preparing!')