# Example no. 1 (scrape-site header; original text: "Esempio n. 1" / "0")
        prefix='bert_meizhuang'  #test_file = None,
    )
    # Build a word + char vocabulary from every word yielded by the dataset,
    # then prune rare entries and convert the dataset to id sequences.
    from data.vocab import Vocab
    vocab = Vocab(lower=True)
    import sys  # NOTE(review): unused in this section — candidate for removal
    # First pass: count every word and every character seen in the data.
    # brc_data.word_iter(None) presumably iterates words across all splits —
    # TODO confirm against the DataSet implementation.
    for word in brc_data.word_iter(None):
        vocab.add(word)
        for char in word:
            vocab.add_char(char)
    logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
    # NOTE(review): message says "vocab size" but this formats
    # vocab.get_word_vocab() (likely the vocab object/dict itself, not a
    # count) — probably meant vocab.size(); confirm and fix.
    logger.info(' vocab size {} '.format(vocab.get_word_vocab()))
    # Snapshot pre-filter sizes so the number of dropped entries can be logged.
    unfiltered_vocab_size = vocab.size()
    unfiltered_char_size = vocab.get_char_vocab_size()
    # Drop words/chars that occur fewer than 2 times.
    vocab.filter_tokens_by_cnt(min_cnt=2)
    vocab.filter_chars_by_cnt(min_cnt=2)

    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    filtered_num = unfiltered_char_size - vocab.get_char_vocab_size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.get_char_vocab_size()))

    # NOTE(review): no embedding is loaded in the visible code despite this
    # message — either the load step was removed or lives outside this chunk.
    logger.info('after load embedding vocab size is {}'.format(vocab.size()))

    # Replace word/char tokens in the dataset with their vocabulary ids.
    brc_data.convert_to_ids(vocab)

    from model.bert_base import BertBaseline
    # NOTE(review): the next line is a truncated fragment of a call — likely
    # the tail of `BertBaseline(..., use_bert=False)` whose opening lines were
    # lost when this file was extracted. It is not valid on its own; restore
    # the full constructor call from the original source.
        use_bert=False)
    # Rebuild the vocabulary for the 'third_level_baihuo_' dataset.
    # In inference mode the count/filter pass is skipped entirely and the
    # vocab is expected to be loaded from file further below.
    from data.vocab import Vocab
    do_inference = True
    vocab = Vocab(lower=True, prefix='third_level_baihuo_')
    if not do_inference:
        # Count every word and every character seen in the data.
        for word in brc_data.word_iter(None):
            vocab.add(word)
            for char in word:
                vocab.add_char(char)
        logger.info(' char size {}'.format(vocab.get_char_vocab_size()))
        # NOTE(review): message says "vocab size" but this formats
        # vocab.get_word_vocab() (likely not a count) — probably meant
        # vocab.size(); confirm and fix.
        logger.info(' vocab size {} '.format(vocab.get_word_vocab()))
        # Snapshot pre-filter sizes so the number of dropped entries can be logged.
        unfiltered_vocab_size = vocab.size()
        unfiltered_char_size = vocab.get_char_vocab_size()
        # Drop words/chars that occur fewer than 2 times.
        vocab.filter_tokens_by_cnt(min_cnt=2)
        # BUG FIX: this line previously read
        #   `do_inference: vocab.filter_chars_by_cnt(min_cnt=2)`
        # which is a PEP 526 variable *annotation* — inside a function body
        # annotations are never evaluated, so the char filter silently never
        # ran. Restored to a plain call, matching the first vocab section.
        vocab.filter_chars_by_cnt(min_cnt=2)
        filtered_num = unfiltered_vocab_size - vocab.size()
        logger.info(
            'After filter {} tokens, the final vocab size is {}'.format(
                filtered_num, vocab.size()))

        filtered_num = unfiltered_char_size - vocab.get_char_vocab_size()
        logger.info(
            'After filter {} tokens, the final vocab size is {}'.format(
                filtered_num, vocab.get_char_vocab_size()))

    import os
    # Restore a previously saved vocabulary if one exists on disk.
    vocab_file = 'first_third_baihuo_vocab.txt'  # vocab.load_from_file('vocab_bool.txt')
    if os.path.exists(vocab_file): vocab.load_from_file(vocab_file)
    # NOTE(review): duplicate guard — this second load re-runs under the same
    # condition with an argument-less vocab.load(); it either reloads
    # redundantly or raises if Vocab has no such method. Confirm which load
    # call is intended and delete the other.
    if os.path.exists(vocab_file): vocab.load()