def test_predict(self):
    """Smoke-test single-sentence inference with the saved FastText model.

    Loads the pickled vocabulary from ``config.vocab_path`` (building and
    caching it from ``config.train_path`` when the file does not exist),
    restores the checkpoint at ``config.save_path`` onto the CPU, runs one
    headline through the model and asserts that the predicted class indices
    are returned as a list.
    """
    use_word = False
    if use_word:
        def tokenizer(text):
            # Word-level: tokens are separated by single spaces.
            return text.split(' ')
    else:
        # Char-level: every character is its own token.
        tokenizer = list

    if os.path.exists(config.vocab_path):
        with open(config.vocab_path, 'rb') as vocab_file:
            vocab = pkl.load(vocab_file)
    else:
        vocab = build_vocab(config.train_path, tokenizer=tokenizer,
                            max_size=MAX_VOCAB_SIZE, min_freq=1)
        with open(config.vocab_path, 'wb') as vocab_file:
            pkl.dump(vocab, vocab_file)
    config.n_vocab = len(vocab)

    model = FastText.Model(config)
    model.load_state_dict(
        torch.load(config.save_path, map_location=torch.device('cpu')))
    # Inference only: disable any train-time behaviour (e.g. dropout) so the
    # prediction does not vary between runs.
    model.eval()

    batches = build_iterator(
        sentance2ids(['2岁男童爬窗台不慎7楼坠下获救(图)'], config), config)

    # Pre-bind so an empty iterator produces an assertion failure rather than
    # a NameError on the final check.
    result = None
    with torch.no_grad():
        for inputs, _labels in batches:
            logits = model(inputs)
            # Index of the highest score along dim 1 for each row.
            result = list(torch.max(logits, 1)[1].cpu().numpy())
            print(result[0])
    self.assertIsInstance(result, list)
from utils_fasttext import build_dataset, build_iterator, get_time_dif embedding = 'random' else: from utils import build_dataset, build_iterator, get_time_dif x = import_module('models.' + model_name) config = x.Config(dataset, embedding) np.random.seed(1) torch.manual_seed(1) torch.cuda.manual_seed_all(1) torch.backends.cudnn.deterministic = True # 保证每次结果一样 start_time = time.time() print("Loading data...") vocab, train_data, dev_data, test_data = build_dataset(config, word) train_iter = build_iterator(train_data, config) dev_iter = build_iterator(dev_data, config) test_iter = build_iterator(test_data, config) time_dif = get_time_dif(start_time) print("Time usage:", time_dif) # train config.n_vocab = len(vocab) # to 是为了去除device model = x.Model(config).to(config.device) if model_name != 'Transformer': init_network(model) print(model.parameters) train(config, model, train_iter, dev_iter, test_iter)
config = x.Config(dataset, embedding)
config.batch_size = 1  # predict one sentence per batch
model = x.Model(config).to(config.device)
# map_location makes the checkpoint load onto config.device even when it was
# saved from a different device (e.g. GPU checkpoint on a CPU-only host).
model.load_state_dict(torch.load(config.save_path, map_location=config.device))
model.eval()


def sentence_to_index(sentence, pad_size=32):
    """Convert one ``"<text>\\t<label>"`` line into a single-sample dataset.

    The text is tokenized per character, padded with ``PAD`` or truncated to
    ``pad_size`` tokens (no padding/truncation when ``pad_size`` is falsy),
    and mapped to ids through the vocabulary pickled at ``config.vocab_path``;
    tokens missing from the vocabulary map to the id of ``UNK``.

    Args:
        sentence: Line holding the text and an integer label separated by a
            single tab.
        pad_size: Target token count; falsy disables padding/truncation.

    Returns:
        A one-element list ``[(token_ids, label, seq_len)]`` where ``seq_len``
        is the unpadded token count, capped at ``pad_size``.

    Raises:
        ValueError: If the stripped line does not split into exactly two
            tab-separated fields, or the label is not an integer.
    """
    with open(config.vocab_path, 'rb') as vocab_file:
        vocab = pkl.load(vocab_file)

    content, label = sentence.strip().split('\t')
    token = list(content)  # char-level tokenization
    seq_len = len(token)
    if pad_size:
        if len(token) < pad_size:
            token.extend([PAD] * (pad_size - len(token)))
        else:
            token = token[:pad_size]
            seq_len = pad_size

    # word to id
    unk_id = vocab.get(UNK)
    words_line = [vocab.get(word, unk_id) for word in token]
    return [(words_line, int(label), seq_len)]


sample = sentence_to_index('学校有哪些体育设施\t0', config.pad_size)
# Gradients are not needed for prediction.
with torch.no_grad():
    for trains, _labels in build_iterator(sample, config):
        # Index of the highest score along dim 1, keeping that dim.
        predict = model(trains).max(1, keepdim=True)[1]
        print(predict)
# Predict the category of a single headline with the saved FastText model.
ues_word = False
if ues_word:
    def tokenizer(x):
        # Word-level: tokens are separated by single spaces.
        return x.split(' ')
else:
    # Char-level: every character is its own token.
    tokenizer = list

# Reuse the cached vocabulary when present; otherwise build it from the
# training file and cache it for later runs.
if os.path.exists(config.vocab_path):
    with open(config.vocab_path, 'rb') as vocab_file:
        vocab = pkl.load(vocab_file)
else:
    vocab = build_vocab(config.train_path, tokenizer=tokenizer,
                        max_size=MAX_VOCAB_SIZE, min_freq=1)
    with open(config.vocab_path, 'wb') as vocab_file:
        pkl.dump(vocab, vocab_file)
config.n_vocab = len(vocab)

model = FastText.Model(config)
map_location = torch.device('cpu')
model.load_state_dict(torch.load(config.save_path, map_location=map_location))
# Inference only: disable any train-time behaviour (e.g. dropout) so the
# prediction does not vary between runs.
model.eval()

# Label names, indexed by the predicted class id.
classify = [
    'finance', 'realty', 'stocks', 'education', 'science',
    'society', 'politics', 'sports', 'game', 'entertainment',
]

_test_sentance = sentance2ids(['北上资金卷土重来:热捧业绩预增 大幅增仓农业股'], config)
_test_sentance = build_iterator(_test_sentance, config)
with torch.no_grad():
    for X, y in _test_sentance:
        result = model(X)
        # Index of the highest score along dim 1 for each row.
        result = list(torch.max(result, 1)[1].cpu().numpy())
        print(classify[result[0]])
# Training entry point: pick the model module, load the data, then train.
model_name = args.model  # TextCNN, TextRNN, FastText, TextRCNN, TextRNN_Att, DPCNN, Transformer
if model_name == 'FastText':
    # FastText uses its own data utilities and always a random embedding.
    from utils_fasttext import build_dataset, build_iterator, get_time_dif
    embedding = 'random'
else:
    from utils import build_dataset, build_iterator, get_time_dif

x = import_module('models.' + model_name)
config = x.Config(dataset, embedding)

# Fix every seed so each run produces the same results.
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True

start_time = time.time()
print("Loading data...")
w2v_vec, train_data, dev_data, test_data = build_dataset(config, args.word)
train_iter = build_iterator(train_data, config, w2v_vec)
dev_iter = build_iterator(dev_data, config, w2v_vec)
test_iter = build_iterator(test_data, config, w2v_vec)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)

# train
# NOTE(review): w2v_vec looks like a gensim Word2Vec model - confirm.
# gensim >= 4 removed ``wv.vocab`` in favour of ``wv.key_to_index``; try the
# new attribute first and fall back to the old one so both versions work.
try:
    config.n_vocab = len(w2v_vec.wv.key_to_index)
except AttributeError:
    config.n_vocab = len(w2v_vec.wv.vocab)

model = x.Model(config).to(config.device)
if model_name != 'Transformer':
    init_network(model)
print(model.parameters)
train(config, model, train_iter, dev_iter, test_iter)