embedding_dim)  # NOTE(review): closes a call whose opening is outside this chunk — presumably datahelper.wordlist_to_matrix(...); confirm upstream.

# Training iterator: buckets examples by combined length of the two text
# fields so batches contain similarly-sized pairs; shuffled each epoch.
train_iter = data.BucketIterator(
    dataset=train,
    batch_size=batch_size,
    sort_key=lambda x: len(x.Text1) + len(x.Text2),
    shuffle=True,
    device=device,
    repeat=False)

# Validation iterator: plain sequential iteration, no shuffling.
valid_iter = data.Iterator(dataset=valid, batch_size=batch_size, device=device, shuffle=False, repeat=False)

# Wrap the torchtext iterators so each batch yields the named fields as a
# tuple (see datahelper.BatchWrapper — not visible here; assumed behavior).
train_dl = datahelper.BatchWrapper(train_iter, ["Text1", "Text2", "Label"])
valid_dl = datahelper.BatchWrapper(valid_iter, ["Text1", "Text2", "Label"])
print('Reading data done.')


def predict_on(model, data_dl, loss_func, device, model_state_path=None):
    """Run the model in eval mode over `data_dl`, accumulating outputs.

    Args:
        model: the network to evaluate.
        data_dl: iterable yielding (text1, text2, label) batches.
        loss_func: loss callable used to accumulate `loss` over batches
            (usage not visible in this chunk — TODO confirm).
        device: target device (appears unused in the visible portion).
        model_state_path: optional checkpoint path; when given, the saved
            state dict is loaded before predicting.

    NOTE(review): the call site later in this file passes only four
    positional args (`MODEL, test_dl, outfile, '../model_save/...'`),
    which does not line up with this signature — verify which
    `predict_on` definition that call actually resolves to.
    """
    if model_state_path:
        # Restore trained weights from disk before inference.
        model.load_state_dict(torch.load(model_state_path))
    print('Start predicting...')
    model.eval()  # disable dropout / switch batch-norm to eval statistics
    res_list = []    # collected per-batch predictions
    label_list = []  # collected ground-truth labels
    loss = 0         # running loss accumulator
    for text1, text2, label in data_dl:  # loop body continues past this chunk
# Input/output paths for the prediction run.
# NOTE(review): `infile` points at train.tsv, so the "test" dataset below is
# actually read from the training file — confirm this is intentional.
infile = "../../data/train.tsv"
outfile = "../../data/predict.tsv"
print('Reading data..')

# Extend jieba's segmenter with a domain-specific dictionary.
jieba.load_userdict("../txt/dict.txt")

# torchtext fields: Id and Label are raw (non-sequential, no vocab) values;
# TEXT is lower-cased, jieba-tokenized, wrapped in <BOS>/<EOS>, padded.
ID = data.Field(sequential=False, batch_first=True, use_vocab=False)
TEXT = data.Field(sequential=True, lower=True, eos_token='<EOS>', init_token='<BOS>', pad_token='<PAD>', fix_length=None, batch_first=True, use_vocab=True, tokenize=jieba.lcut)
LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False)

# Training set (used only to build the vocabulary here); path is hard-coded
# to the same file `infile` holds.
train = data.TabularDataset(
    path='../../data/train.tsv',
    format='tsv',
    fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)])

# Prediction set: same columns minus the label.
test = data.TabularDataset(
    path=infile,
    format='tsv',
    fields=[('Id', ID), ('Text1', TEXT), ('Text2', TEXT)])

# Vocabulary from the training corpus; rare tokens (<3 occurrences) dropped.
TEXT.build_vocab(train, min_freq=3)
print('Building vocabulary Finished.')

# Pretrained embedding matrix aligned to the vocab's index order
# (datahelper helper — assumed to return a tensor of shape
# (len(vocab), embedding_dim); TODO confirm).
word_matrix = datahelper.wordlist_to_matrix("../txt/embedding_300d.bin", TEXT.vocab.itos, device, embedding_dim)

# Deterministic iteration over the prediction set (no shuffle, no repeat).
test_iter = data.Iterator(dataset=test, batch_size=batch_size, device=device, shuffle=False, repeat=False)
test_dl = datahelper.BatchWrapper(test_iter, ["Id", "Text1", "Text2"])

# Build the model and move it to the target device.
MODEL = wide_deep(len(TEXT.vocab), embedding_dim, hidden_dim, batch_size, word_matrix, bidirectional=bidirectional)
MODEL.to(device)

# NOTE(review): this call passes `outfile` and the checkpoint path as the
# 3rd/4th positional args. The `predict_on` defined earlier in this file
# takes (model, data_dl, loss_func, device, model_state_path=None), under
# which `outfile` would bind to `loss_func` and the checkpoint path to
# `device`, leaving the saved weights unloaded — verify which definition of
# `predict_on` is in scope here.
predict_on(MODEL, test_dl, outfile, '../model_save/wide_deep.pth')