def test_create_byte_token_tensors(self):
    tensorizer = ByteTokenTensorizer(
        text_column="text", max_seq_len=4, max_byte_len=5
    )
    # not initializing because initializing is a no-op for this tensorizer

    s1 = "I want some coffee today"
    s2 = "Turn it up"

    def ords(word, pad_to):
        return list(word.encode()) + [0] * (pad_to - len(word))

    batch = [{"text": s1}, {"text": s2}]
    # Note that the tokenizer lowercases here
    expected = [
        [ords("i", 5), ords("want", 5), ords("some", 5), ords("coffe", 5)],
        [ords("turn", 5), ords("it", 5), ords("up", 5), ords("", 5)],
    ]
    expected_token_lens = [4, 3]
    expected_byte_lens = [[1, 4, 4, 5], [4, 2, 2, 0]]

    bytes, token_lens, byte_lens = tensorizer.tensorize(
        [tensorizer.numberize(row) for row in batch]
    )
    self.assertIsInstance(bytes, torch.LongTensor)
    self.assertIsInstance(token_lens, torch.LongTensor)
    self.assertIsInstance(byte_lens, torch.LongTensor)
    self.assertEqual((2, 4, 5), bytes.size())
    self.assertEqual((2,), token_lens.size())
    self.assertEqual((2, 4), byte_lens.size())
    self.assertEqual(expected, bytes.tolist())
    self.assertEqual(expected_token_lens, token_lens.tolist())
    self.assertEqual(expected_byte_lens, byte_lens.tolist())
def test_tokens_dictfeat_contextual(self):
    # TODO (T65593688): this should be removed after
    # https://github.com/pytorch/pytorch/pull/33645 is merged.
    with torch.no_grad():
        model = Seq2SeqModel.from_config(
            Seq2SeqModel.Config(
                source_embedding=WordEmbedding.Config(embed_dim=512),
                target_embedding=WordEmbedding.Config(embed_dim=512),
                inputs=Seq2SeqModel.Config.ModelInput(
                    dict_feat=GazetteerTensorizer.Config(
                        text_column="source_sequence"
                    ),
                    contextual_token_embedding=ByteTokenTensorizer.Config(),
                ),
                encoder_decoder=RNNModel.Config(
                    encoder=LSTMSequenceEncoder.Config(embed_dim=619)
                ),
                dict_embedding=DictEmbedding.Config(),
                contextual_token_embedding=ContextualTokenEmbedding.Config(
                    embed_dim=7
                ),
            ),
            get_tensorizers(add_dict_feat=True, add_contextual_feat=True),
        )
        model.eval()
        ts_model = model.torchscriptify()
        res = ts_model(
            ["call", "mom"],
            (["call", "mom"], [0.42, 0.17], [4, 3]),
            [0.42] * (7 * 2),
        )
        assert res is not None
def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )
    src_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="source_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tgt_tensorizer = TokenTensorizer.from_config(
        TokenTensorizer.Config(
            column="target_sequence", add_eos_token=True, add_bos_token=True
        )
    )
    tensorizers = {"src_seq_tokens": src_tensorizer, "trg_seq_tokens": tgt_tensorizer}
    initialize_tensorizers(tensorizers, data_source.train)

    if add_dict_feat:
        tensorizers["dict_feat"] = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        initialize_tensorizers(
            {"dict_feat": tensorizers["dict_feat"]}, data_source.train
        )

    if add_contextual_feat:
        tensorizers["contextual_token_embedding"] = ByteTokenTensorizer.from_config(
            ByteTokenTensorizer.Config(column="source_sequence")
        )
        initialize_tensorizers(
            {"contextual_token_embedding": tensorizers["contextual_token_embedding"]},
            data_source.train,
        )

    return tensorizers
def test_tokens_contextual(self):
    model = Seq2SeqModel.from_config(
        Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
            inputs=Seq2SeqModel.Config.ModelInput(
                contextual_token_embedding=ByteTokenTensorizer.Config()
            ),
            contextual_token_embedding=ContextualTokenEmbedding.Config(embed_dim=7),
            encoder_decoder=RNNModel.Config(
                encoder=LSTMSequenceEncoder.Config(embed_dim=519)
            ),
        ),
        get_tensorizers(add_contextual_feat=True),
    )
    model.eval()
    ts_model = model.torchscriptify()
    res = ts_model(["call", "mom"], contextual_token_embedding=[0.42] * (7 * 2))
    assert res is not None
def test_tokens_dictfeat_contextual(self):
    model = Seq2SeqModel.from_config(
        Seq2SeqModel.Config(
            source_embedding=WordEmbedding.Config(embed_dim=512),
            target_embedding=WordEmbedding.Config(embed_dim=512),
            inputs=Seq2SeqModel.Config.ModelInput(
                dict_feat=GazetteerTensorizer.Config(text_column="source_sequence"),
                contextual_token_embedding=ByteTokenTensorizer.Config(),
            ),
            encoder_decoder=RNNModel.Config(
                encoder=LSTMSequenceEncoder.Config(embed_dim=619)
            ),
            dict_embedding=DictEmbedding.Config(),
            contextual_token_embedding=ContextualTokenEmbedding.Config(embed_dim=7),
        ),
        get_tensorizers(add_dict_feat=True, add_contextual_feat=True),
    )
    model.eval()
    ts_model = model.torchscriptify()
    res = ts_model(
        ["call", "mom"],
        (["call", "mom"], [0.42, 0.17], [4, 3]),
        [0.42] * (7 * 2),
    )
    assert res is not None
class ByteModelInput(DocModel.Config.ModelInput):
    token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
class ByteModelInput(Model.Config.ModelInput):
    # We should support characters as well, but CharacterTokenTensorizer
    # does not support adding characters to vocab yet.
    token_bytes: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()
    labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config()
class ByteModelInput(WordTaggingModel.Config.ModelInput):
    # We should support characters as well, but CharacterTokenTensorizer
    # does not support adding characters to vocab yet.
    tokens: ByteTokenTensorizer.Config = ByteTokenTensorizer.Config()