# Example 1 (score: 0)
def prepare(args):
    """
    Validate the data files, create output directories, and build and save
    the vocabulary (word-level, plus optional char-level) with embeddings.
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    all_data_files = args.train_files + args.dev_files + args.test_files
    for data_path in all_data_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    logger.info('Preparing the directories...')
    output_dirs = (args.vocab_dir, args.model_dir, args.result_dir,
                   args.summary_dir)
    for dir_path in output_dirs:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.max_word_len, args.train_files, args.dev_files,
                          args.test_files)
    vocab = Vocab(lower=True)
    # Collect word tokens from every split into a single shared vocabulary.
    for split in ('train', 'test', 'dev'):
        for word in brc_data.word_iter(split):
            vocab.add(word)

    size_before_filter = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=5)
    removed_count = size_before_filter - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        removed_count, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)
    # Alternative: vocab.load_pretrained_embeddings(args.embedding_path)

    if args.use_char_level:
        # Same procedure at character granularity: collect, filter, embed.
        for split in ('train', 'test', 'dev'):
            for char in brc_data.char_iter(split):
                vocab.add(char)
        char_size_before_filter = vocab.char_size()
        vocab.filter_chars_by_cnt(min_cnt=5)
        removed_count = char_size_before_filter - vocab.char_size()
        logger.info('After filter {} chars, the final vocab size is {}'.format(
            removed_count, vocab.char_size()))
        logger.info('Assigning char embeddings...')
        vocab.randomly_init_char_embeddings(args.char_embed_size)
        # Alternative: vocab.load_pretrained_char_embeddings(args.char_embedding_path)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
# Example 2 (score: 0)
def prepare(args):
    """
    Validate the data files, create output directories, and build and save
    a word vocabulary plus a separate char vocabulary, each initialized with
    word2vec/char2vec embeddings trained on the training set.
    """
    logger = logging.getLogger("brc")
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)

    logger.info('Preparing the directories...')
    for dir_path in (args.vocab_dir, args.model_dir, args.result_dir,
                     args.summary_dir):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    # Build the dataset.
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.word_size, args.train_files, args.dev_files,
                          args.test_files)
    vocab = Vocab(lower=True)
    # All words from the questions and selected answers become the vocabulary.
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    word_size_before = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=1)  # originally 5
    removed_count = word_size_before - vocab.size()  # number of filtered words
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        removed_count, vocab.size()))

    logger.info('Assigning embeddings...')
    # Alternatives:
    #   vocab.randomly_init_embeddings(args.embed_size)
    #   vocab.word2vec_init_embeddings(args.embed_size,
    #       sentences=brc_data.generate_word2vec_testset(), min_cnt=1)
    #   vocab.glove_init_embeddings(args.embed_size)
    vocab.word2vec_init_embeddings(
        args.embed_size,
        sentences=brc_data.generate_word2vec_trainset(),
        min_cnt=1)

    # Character-level vocabulary, kept separate from the word vocabulary.
    char_vocab = Vocab(lower=True)
    for char in brc_data.char_iter('train'):
        char_vocab.add(char)

    char_size_before = char_vocab.size()
    char_vocab.filter_tokens_by_cnt(min_cnt=1)
    removed_count = char_size_before - char_vocab.size()
    logger.info('after filter {} tokens,the char vocab size is {}'.format(
        removed_count, char_vocab.size()))

    char_vocab.char2vec_init_embeddings(
        args.char_embed_size,
        sentences=brc_data.generate_char2vec_trainset(),
        min_cnt=1)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)
    with open(os.path.join(args.vocab_dir, 'char_vocab.data'), 'wb') as fout:
        pickle.dump(char_vocab, fout)
    logger.info('Done with preparing!')