# Both splits share the same 4-column TSV layout: id, first text,
# second text, gold label.  Build the field spec once and reuse it.
shared_fields = [
    ('Id', ID),
    ('Text1', TEXT),
    ('Text2', TEXT),
    ('Label', LABEL),
]

train = data.TabularDataset(
    path='../../data/train.tsv',
    format='tsv',
    fields=shared_fields,
    skip_header=True,
)
valid = data.TabularDataset(
    path='../../data/valid.tsv',
    format='tsv',
    fields=shared_fields,
    skip_header=True,
)

# Vocabulary comes from the training split only; tokens seen fewer
# than 3 times are excluded (they will fall back to <unk>).
TEXT.build_vocab(train, min_freq=3)
print('Building vocabulary Finished.')

# Pretrained embedding row for every entry of the vocabulary
# (itos = index-to-string list), moved to the target device.
word_matrix = datahelper.wordlist_to_matrix(
    "../txt/embedding_300d.bin", TEXT.vocab.itos, device, embedding_dim)

# Training batches are bucketed by combined text length to reduce
# padding; validation keeps file order (no shuffling, single pass).
train_iter = data.BucketIterator(
    dataset=train,
    batch_size=batch_size,
    sort_key=lambda x: len(x.Text1) + len(x.Text2),
    shuffle=True,
    device=device,
    repeat=False,
)
valid_iter = data.Iterator(
    dataset=valid,
    batch_size=batch_size,
    device=device,
    shuffle=False,
    repeat=False,
)
# Label column is a single non-sequential value; use_vocab=False means
# the raw field value is used directly instead of a vocabulary index.
LABEL = data.Field(sequential=False, batch_first=True, use_vocab=False)

# The training file carries a gold label; the inference file does not.
# NOTE(review): unlike the training script, no skip_header is passed
# here — confirm these TSVs really have no header row.
labeled_fields = [('Id', ID), ('Text1', TEXT), ('Text2', TEXT), ('Label', LABEL)]
unlabeled_fields = [('Id', ID), ('Text1', TEXT), ('Text2', TEXT)]

train = data.TabularDataset(path='./train.tsv', format='tsv',
                            fields=labeled_fields)
test = data.TabularDataset(path=infile, format='tsv',
                           fields=unlabeled_fields)

# Rebuild the vocabulary from the training split so test-time token
# ids match the ids the model was trained with.
TEXT.build_vocab(train, min_freq=3)
print('Building vocabulary Finished.')

# Pretrained embedding row per vocabulary entry, on the target device.
word_matrix = datahelper.wordlist_to_matrix(
    "./data/embedding_300d.txt", TEXT.vocab.itos, device, embedding_dim)

# Deterministic single-pass iteration over the inference set.
test_iter = data.Iterator(
    dataset=test,
    batch_size=batch_size,
    device=device,
    shuffle=False,
    repeat=False,
)
test_dl = datahelper.BatchWrapper(test_iter, ["Id", "Text1", "Text2"])

MODEL = LSTM_angel(
    len(TEXT.vocab),
    embedding_dim,
    hidden_dim,
    batch_size,
    word_matrix,
    bidirectional=bidirectional,
)