def test_seq_tensor(self):
    """Numberize and prepare a sequence-of-sentences field with padding.

    Uses the single-row fixture ["where do you wanna meet?", "MPK"] and
    checks indices, per-sentence lengths, sequence length, and the padded
    token strings produced by the tensorizer.
    """
    tensorizer = SeqTokenTensorizer()
    source = TSVDataSource(
        train_file=SafeFileWrapper(
            tests_module.test_file("train_seq_features.tsv")
        ),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    self._initialize_tensorizer(tensorizer, source)
    # Vocabulary: UNK + PAD + the 6 distinct tokens from the fixture.
    self.assertEqual(8, len(tensorizer.vocab))

    expected_indices = [[2, 3, 4, 5, 6], [7, 1, 1, 1, 1]]
    expected_sentence_lens = [5, 1]
    expected_tokens = [
        ["where", "do", "you", "wanna", "meet?"],
        ["mpk", "__PAD__", "__PAD__", "__PAD__", "__PAD__"],
    ]

    # The train split contains exactly one row:
    # ["where do you wanna meet?", "MPK"]
    for example in source.train:
        padded_tokens, _token_lens, prep_seq_len = tensorizer.prepare_input(example)
        token_indices, sentence_lens, seq_len = tensorizer.numberize(example)
        self.assertEqual(2, seq_len)
        self.assertEqual(expected_indices, token_indices)
        self.assertEqual(expected_sentence_lens, sentence_lens)
        self.assertEqual(2, prep_seq_len)
        self.assertEqual(expected_tokens, padded_tokens)
def test_seq_tensor_with_bos_eos_eol_bol(self):
    """Numberize and prepare a sequence field with all marker tokens on.

    With add_bos/eos/bol/eol enabled, every sentence is wrapped in
    BOS/EOS and the whole list is bracketed by synthetic BOL/EOL
    sentences, so the single fixture row expands to four sentences.
    """
    tensorizer = SeqTokenTensorizer(
        add_bos_token=True,
        add_eos_token=True,
        add_bol_token=True,
        add_eol_token=True,
    )
    source = TSVDataSource(
        train_file=SafeFileWrapper(
            tests_module.test_file("train_seq_features.tsv")
        ),
        test_file=None,
        eval_file=None,
        field_names=["text_seq"],
        schema={"text_seq": List[str]},
    )
    self._initialize_tensorizer(tensorizer, source)
    # Vocabulary: UNK + PAD + BOS + EOS + BOL + EOL + the 6 fixture tokens.
    self.assertEqual(12, len(tensorizer.vocab))

    expected_indices = [
        [2, 4, 3, 1, 1, 1, 1],
        [2, 6, 7, 8, 9, 10, 3],
        [2, 11, 3, 1, 1, 1, 1],
        [2, 5, 3, 1, 1, 1, 1],
    ]
    expected_tokens = [
        [
            "__BEGIN_OF_SENTENCE__",
            "__BEGIN_OF_LIST__",
            "__END_OF_SENTENCE__",
            "__PAD__",
            "__PAD__",
            "__PAD__",
            "__PAD__",
        ],
        [
            "__BEGIN_OF_SENTENCE__",
            "where",
            "do",
            "you",
            "wanna",
            "meet?",
            "__END_OF_SENTENCE__",
        ],
        [
            "__BEGIN_OF_SENTENCE__",
            "mpk",
            "__END_OF_SENTENCE__",
            "__PAD__",
            "__PAD__",
            "__PAD__",
            "__PAD__",
        ],
        [
            "__BEGIN_OF_SENTENCE__",
            "__END_OF_LIST__",
            "__END_OF_SENTENCE__",
            "__PAD__",
            "__PAD__",
            "__PAD__",
            "__PAD__",
        ],
    ]

    # The train split contains exactly one row:
    # ["where do you wanna meet?", "MPK"]
    for example in source.train:
        token_indices, _sentence_lens, seq_len = tensorizer.numberize(example)
        padded_tokens, prep_token_lens, prep_seq_len = tensorizer.prepare_input(example)
        self.assertEqual(4, seq_len)
        self.assertEqual(4, prep_seq_len)
        self.assertEqual([3, 7, 3, 3], prep_token_lens)
        self.assertEqual(expected_indices, token_indices)
        self.assertEqual(expected_tokens, padded_tokens)