# ---- build_vocab: construct word/char vocabularies from the corpus ----
import argparse
import logging
import sys

sys.path.append('../')

from data.data_reader_new import DatasetReader  # assumed module path, matching the serving script below
from data.vocab import Vocab

logger = logging.getLogger("brc")
logger.setLevel(logging.INFO)

# args.input is referenced below; a minimal argparse setup is assumed here.
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True, help='path to the input data file')
args = parser.parse_args()

brc_data = DatasetReader(
    test_file=args.input,
    bert_dir='/home/wujindou/chinese_L-12_H-768_A-12',
    # prefix='bert_meizhuang'
)

vocab = Vocab(lower=True)
for word in brc_data.word_iter(None):
    vocab.add(word)
    for char in word:
        vocab.add_char(char)

logger.info('char size {}'.format(vocab.get_char_vocab_size()))
logger.info('vocab size {}'.format(vocab.size()))

unfiltered_vocab_size = vocab.size()
unfiltered_char_size = vocab.get_char_vocab_size()
vocab.filter_tokens_by_cnt(min_cnt=2)
vocab.filter_chars_by_cnt(min_cnt=2)

filtered_num = unfiltered_vocab_size - vocab.size()
logger.info('After filtering {} tokens, the final vocab size is {}'.format(
    filtered_num, vocab.size()))

filtered_num = unfiltered_char_size - vocab.get_char_vocab_size()
logger.info('After filtering {} chars, the final char vocab size is {}'.format(
    filtered_num, vocab.get_char_vocab_size()))

# embeddings are assumed to be loaded elsewhere before this point
logger.info('after loading embeddings the vocab size is {}'.format(vocab.size()))
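# The serving script below reads the vocabulary back with
# vocab.load_from_file('../examples/politic_vocab5.txt'), so the build step
# presumably persists it first. The Vocab save API is not shown above, so this
# is a minimal sketch using pickle with a hypothetical output path; if Vocab
# provides a save_to_file() counterpart to load_from_file(), prefer that.
import pickle

with open('politic_vocab.pkl', 'wb') as fout:  # hypothetical path
    pickle.dump(vocab, fout)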
# ---- serve: Flask app exposing the trained TextCNN politics classifier ----
import os
import sys

sys.path.append('../')

from flask import Flask, render_template

from data.vocab import Vocab
from model.text_cnn import TextCNN

os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all GPUs; serve on CPU

app = Flask(__name__)

vocab_file = '../examples/politic_vocab5.txt'
vocab = Vocab(lower=True)
if os.path.exists(vocab_file):
    vocab.load_from_file(vocab_file)
    print(vocab.get_word_vocab())


@app.route('/')
def search_index():
    return render_template('index.html')


model = TextCNN(vocab,
                num_class=2,
                pretrained_word_embedding=vocab.embeddings,
                word_embedding_size=300)
model.load("/search/odin/jdwu/classification/cls_checkpoints/politic/best_weights")
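# A minimal sketch of a JSON prediction endpoint. model.predict() is
# hypothetical here: the TextCNN inference API is not shown above, only
# load(), so adapt the call to whatever the class actually exposes.
from flask import request, jsonify


@app.route('/predict', methods=['POST'])
def predict():
    text = request.json.get('text', '')
    probs = model.predict([text])  # hypothetical inference call
    return jsonify({'text': text, 'probs': list(map(float, probs[0]))})


if __name__ == '__main__':
    # standard Flask entry point so the script can be run directly
    app.run(host='0.0.0.0', port=5000)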