def get_tensorizers(add_dict_feat=False, add_contextual_feat=False):
    """Build and initialize source/target token tensorizers from the test TSV.

    Optionally adds an initialized gazetteer (dict-feature) tensorizer.
    NOTE(review): `add_contextual_feat` is accepted but never used in this
    body — confirm whether a contextual-embedding tensorizer was intended.
    """
    schema = {"source_sequence": str, "dict_feat": Gazetteer, "target_sequence": str}
    data_source = TSVDataSource.from_config(
        TSVDataSource.Config(
            train_filename=TEST_FILE_NAME,
            field_names=["source_sequence", "dict_feat", "target_sequence"],
        ),
        schema,
    )

    def _seq_tokens(column):
        # Source and target share the same config except for the column name.
        return TokenTensorizer.from_config(
            TokenTensorizer.Config(
                column=column, add_eos_token=True, add_bos_token=True
            )
        )

    tensorizers = {
        "src_seq_tokens": _seq_tokens("source_sequence"),
        "trg_seq_tokens": _seq_tokens("target_sequence"),
    }
    initialize_tensorizers(tensorizers, data_source.train)
    if add_dict_feat:
        dict_tensorizer = GazetteerTensorizer.from_config(
            GazetteerTensorizer.Config(
                text_column="source_sequence", dict_column="dict_feat"
            )
        )
        tensorizers["dict_feat"] = dict_tensorizer
        # The other tensorizers are already initialized; only do this one.
        initialize_tensorizers({"dict_feat": dict_tensorizer}, data_source.train)
    return tensorizers
class ModelInput(Model.Config.ModelInput):
    """Input config for a query/response ranking task: one query paired with
    a positive and a negative candidate response, each tokenized separately."""

    # Tokenized positive response, read from the "pos_response" column.
    pos_response: TokenTensorizer.Config = TokenTensorizer.Config(
        column="pos_response"
    )
    # Tokenized negative response, read from the "neg_response" column.
    neg_response: TokenTensorizer.Config = TokenTensorizer.Config(
        column="neg_response"
    )
    # Tokenized query text, read from the "query" column.
    query: TokenTensorizer.Config = TokenTensorizer.Config(column="query")
class ModelInput(BasePairwiseModel.Config.ModelInput):
    """Input config for a pairwise text model: two token streams plus labels."""

    # First text of the pair, tokenized from the "text1" column.
    tokens1: TokenTensorizer.Config = TokenTensorizer.Config(column="text1")
    # Second text of the pair, tokenized from the "text2" column.
    tokens2: TokenTensorizer.Config = TokenTensorizer.Config(column="text2")
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
    # For the metric reporter: both raw texts joined into a single string.
    raw_text: JoinStringTensorizer.Config = JoinStringTensorizer.Config(
        columns=["text1", "text2"]
    )
def test_initialize_word_tensorizer(self):
    """Driving initialize() over every training row should build the vocab."""
    tensorizer = TokenTensorizer(text_column="text")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator before feeding rows
    for example in self.data.train:
        initializer.send(example)
    initializer.close()
    self.assertEqual(49, len(tensorizer.vocab))
def test_numberize_with_token_tensorizer(self):
    """numberize() yields (token_ids, seq_len, token_ranges) per row."""
    tensorizer = TokenTensorizer(text_column="text")
    self._initialize_tensorizer(tensorizer)
    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    numberized = [tensorizer.numberize(row) for row in rows]

    token_ids, length, ranges = numberized[0]
    self.assertEqual([24, 0, 0, 0], token_ids)
    self.assertEqual(4, length)
    self.assertEqual([(0, 1), (2, 6), (7, 11), (12, 18)], ranges)

    token_ids, length, ranges = numberized[1]
    self.assertEqual([13, 47, 9], token_ids)
    self.assertEqual(3, length)
    self.assertEqual([(0, 4), (5, 7), (8, 10)], ranges)
def test_batch_predict_caffe2_model(self):
    """End-to-end check that a doc-classification model exported to Caffe2
    produces the same per-example scores as PyTorch prediction, across
    several cache_size settings of batch_predict_caffe2_model."""
    with tempfile.NamedTemporaryFile() as snapshot_file, tempfile.NamedTemporaryFile() as caffe2_model_file:
        train_data = tests_module.test_file("train_data_tiny.tsv")
        eval_data = tests_module.test_file("test_data_tiny.tsv")
        config = PyTextConfig(
            task=DocumentClassificationTask.Config(
                model=DocModel.Config(
                    inputs=DocModel.Config.ModelInput(
                        tokens=TokenTensorizer.Config(),
                        dense=FloatListTensorizer.Config(
                            column="dense", dim=1, error_check=True
                        ),
                        labels=LabelTensorizer.Config(),
                    )
                ),
                data=Data.Config(
                    source=TSVDataSource.Config(
                        train_filename=train_data,
                        eval_filename=eval_data,
                        test_filename=eval_data,
                        field_names=["label", "slots", "text", "dense"],
                    )
                ),
            ),
            version=21,
            save_snapshot_path=snapshot_file.name,
            export_caffe2_path=caffe2_model_file.name,
        )
        task = create_task(config.task)
        # Export the (untrained) model to the Caffe2 file under test.
        task.export(task.model, caffe2_model_file.name)
        model = task.model
        # Save a snapshot so batch_predict_caffe2_model can reload the task.
        save(config, model, meta=None, tensorizers=task.data.tensorizers)
        # Reference predictions from the in-process PyTorch model.
        pt_results = task.predict(task.data.data_source.test)

        def assert_caffe2_results_correct(caffe2_results):
            # Compare per-example scores; each Caffe2 result is a mapping
            # whose values hold per-class score arrays.
            for pt_res, res in zip(pt_results, caffe2_results):
                np.testing.assert_array_almost_equal(
                    pt_res["score"].tolist()[0],
                    [score[0] for score in res.values()],
                )

        # Default cache size.
        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
        # Small bounded cache.
        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name, cache_size=2
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
        # cache_size=-1 — presumably an unbounded cache; confirm against
        # batch_predict_caffe2_model's docs.
        results = batch_predict_caffe2_model(
            snapshot_file.name, caffe2_model_file.name, cache_size=-1
        )
        self.assertEqual(4, len(results))
        assert_caffe2_results_correct(results)
def test_create_word_tensors(self):
    """numberize() should produce (token_ids, seq_len) pairs for each row."""
    tensorizer = TokenTensorizer(text_column="text")
    initializer = tensorizer.initialize()
    initializer.send(None)  # prime the generator before feeding rows
    for example in self.data.train:
        initializer.send(example)
    initializer.close()

    rows = [{"text": "I want some coffee"}, {"text": "Turn it up"}]
    numberized = [tensorizer.numberize(row) for row in rows]

    token_ids, length = numberized[0]
    self.assertEqual([24, 0, 0, 0], token_ids)
    self.assertEqual(4, length)

    token_ids, length = numberized[1]
    self.assertEqual([13, 47, 9], token_ids)
    self.assertEqual(3, length)
class ModelInput(Model.Config.ModelInput):
    """Input config for a joint document/word model: tokens, per-word slot
    labels, document labels, and optional per-example weights."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Per-token slot labels; unseen labels are permitted (allow_unknown).
    word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
        allow_unknown=True)
    # Document-level labels; unseen labels are permitted as well.
    doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
        allow_unknown=True)
    # Optional per-example weights; disabled (None) by default.
    doc_weight: Optional[FloatTensorizer.Config] = None
    word_weight: Optional[FloatTensorizer.Config] = None
def test_initialize_tensorizers(self):
    """initialize_tensorizers should build every tensorizer in one data pass."""
    token_tensorizer = TokenTensorizer(text_column="text")
    label_tensorizer = LabelTensorizer(label_column="label")
    byte_tensorizer = ByteTensorizer(text_column="text")
    tensorizers = {
        "tokens": token_tensorizer,
        "labels": label_tensorizer,
        "chars": byte_tensorizer,
    }
    initialize_tensorizers(tensorizers, self.data.train)
    # Only the vocab-building tensorizers are asserted here.
    self.assertEqual(49, len(token_tensorizer.vocab))
    self.assertEqual(7, len(label_tensorizer.vocab))
class ModelInput(Model.Config.ModelInput):
    """Input config for a joint document/word model with mandatory
    per-example weight columns."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Per-token slot labels; unseen labels are permitted (allow_unknown).
    word_labels: SlotLabelTensorizer.Config = SlotLabelTensorizer.Config(
        allow_unknown=True)
    # Document-level labels; unseen labels are permitted as well.
    doc_labels: LabelTensorizer.Config = LabelTensorizer.Config(
        allow_unknown=True)
    # Per-example document weight, read from the "doc_weight" column.
    doc_weight: FloatTensorizer.Config = FloatTensorizer.Config(
        column="doc_weight")
    # Per-example word weight, read from the "word_weight" column.
    word_weight: FloatTensorizer.Config = FloatTensorizer.Config(
        column="word_weight")
def test_create_batches_different_tensorizers(self):
    """A token-only tensorizer set should still yield well-formed batches."""
    tensorizer_map = {"tokens": TokenTensorizer(text_column="text")}
    data = Data(self.data_source, tensorizer_map, Batcher(train_batch_size=16))
    batches = list(data.batches(Stage.TRAIN))
    # All 10 examples fit in one batch of size 16.
    self.assertEqual(1, len(batches))
    raw_batch, batch = batches[0]
    self.assertEqual({"tokens"}, set(batch))
    tokens, seq_lens, _ = batch["tokens"]
    self.assertEqual((10,), seq_lens.size())
    self.assertEqual(10, len(tokens))
def test_data_initializes_tensorsizers(self):
    """Constructing Data should initialize (build vocabs for) its tensorizers."""
    tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label"),
    }
    # Verify TokenTensorizer isn't in an initialized state yet. Use a unittest
    # assertion instead of a bare `assert`, which is stripped under `python -O`
    # and is inconsistent with the self.assert* style used in these tests.
    self.assertIsNone(tensorizers["tokens"].vocab)
    Data(self.data_source, tensorizers)
    # Tensorizers should have been initialized by the Data constructor.
    self.assertEqual(49, len(tensorizers["tokens"].vocab))
    self.assertEqual(7, len(tensorizers["labels"].vocab))
def test_initialize_token_tensorizer(self):
    """Cover the four supported ways of building a TokenTensorizer vocab."""
    # 1) Default: vocab built entirely from the training data.
    tensorizer = TokenTensorizer(text_column="text")
    self._initialize_tensorizer(tensorizer)
    self.assertEqual(49, len(tensorizer.vocab))

    # 2) Data-built vocab with a size cap.
    capped_config = VocabConfig(size_from_data=3)
    tensorizer = TokenTensorizer(text_column="text", vocab_config=capped_config)
    self._initialize_tensorizer(tensorizer)
    self.assertEqual(5, len(tensorizer.vocab))  # 3 + unk token + pad token

    embed_file = tests_module.test_file("pretrained_embed_raw")

    # 3) Capped data vocab merged with tokens from a vocab file.
    merged_config = VocabConfig(
        size_from_data=3,
        vocab_files=[VocabFileConfig(filepath=embed_file, skip_header_line=True)],
    )
    tensorizer = TokenTensorizer(text_column="text", vocab_config=merged_config)
    self._initialize_tensorizer(tensorizer)
    self.assertEqual(15, len(tensorizer.vocab))

    # 4) Vocab built only from the file, capped at 5 entries.
    file_only_config = VocabConfig(
        build_from_data=False,
        vocab_files=[
            VocabFileConfig(filepath=embed_file, skip_header_line=True, size_limit=5)
        ],
    )
    tensorizer = TokenTensorizer(text_column="text", vocab_config=file_only_config)
    init = tensorizer.initialize()
    # With no data pass needed, initialization should finish immediately.
    with self.assertRaises(StopIteration):
        init.send(None)
    self.assertEqual(7, len(tensorizer.vocab))  # 5 + unk token + pad token
def from_config(cls, config: Config, **kwargs):
    """Build document and question tensorizers (sharing one tokenizer/vocab)
    from the given config and construct the component."""
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    vocab = None
    if isinstance(tokenizer, WordPieceTokenizer):
        print("Using WordPieceTokenizer")
        # Map BERT-style special tokens onto this codebase's special tokens.
        replacements = {
            "[UNK]": UNK,
            "[PAD]": PAD,
            "[CLS]": BOS,
            "[SEP]": EOS,
            "[MASK]": MASK,
        }
        vocab = Vocabulary(
            [token for token, _ in tokenizer.vocab.items()],
            replacements=replacements,
        )

    def _text_tensorizer(column, max_len):
        # Doc and question tensorizers differ only in column and length cap.
        return TokenTensorizer(
            text_column=column,
            tokenizer=tokenizer,
            vocab=vocab,
            max_seq_len=max_len,
        )

    return cls(
        doc_tensorizer=_text_tensorizer(config.doc_column, config.max_doc_seq_len),
        ques_tensorizer=_text_tensorizer(config.ques_column, config.max_ques_seq_len),
        doc_column=config.doc_column,
        ques_column=config.ques_column,
        answers_column=config.answers_column,
        answer_starts_column=config.answer_starts_column,
        tokenizer=tokenizer,
        vocab=vocab,
        **kwargs,
    )
def _get_tensorizers(self):
    """Build initialized source/target token tensorizers from the test TSV."""
    schema = {"source_sequence": str, "target_sequence": str}
    source_config = TSVDataSource.Config(
        train_filename=tests_module.test_file("compositional_seq2seq_unit.tsv"),
        field_names=["source_sequence", "target_sequence"],
    )
    data_source = TSVDataSource.from_config(source_config, schema)

    def _seq_tokens(column):
        # Both sides add BOS/EOS markers; only the column name differs.
        return TokenTensorizer.from_config(
            TokenTensorizer.Config(
                column=column, add_eos_token=True, add_bos_token=True
            )
        )

    tensorizers = {
        "src_seq_tokens": _seq_tokens("source_sequence"),
        "trg_seq_tokens": _seq_tokens("target_sequence"),
    }
    initialize_tensorizers(tensorizers, data_source.train)
    return tensorizers
def setUp(self):
    """Create a dense-features TSV data source and basic tensorizers."""
    train_file = tests_module.test_file("train_dense_features_tiny.tsv")
    test_file = tests_module.test_file("test_dense_features_tiny.tsv")
    self.data_source = TSVDataSource(
        SafeFileWrapper(train_file),
        SafeFileWrapper(test_file),
        eval_file=None,
        field_names=["label", "slots", "text", "dense"],
        schema={"text": str, "label": str},
    )
    self.tensorizers = {
        "tokens": TokenTensorizer(text_column="text"),
        "labels": LabelTensorizer(label_column="label", allow_unknown=True),
    }
class ModelInput(Model.Config.ModelInput):
    """Input config for a document model with optional dense float features."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Optional dense float-list features; disabled (None) by default.
    dense: Optional[FloatListTensorizer.Config] = None
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class ModelInput(Model.Config.ModelInput):
    """Input config: a single token stream wrapped with BOS/EOS markers."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config(
        add_bos_token=True, add_eos_token=True)
class ModelInput(Model.Config.ModelInput): tokens: TokenTensorizer.Config = TokenTensorizer.Config( add_bos_token=True, add_eos_token=True ) # for metric reporter raw_text: RawString.Config = RawString.Config(column="text")
class ModelInput(Model.Config.ModelInput):
    """Input config for a seq2seq model: source and target token streams,
    with optional gazetteer and contextual-token-embedding features."""

    src_seq_tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    trg_seq_tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Optional gazetteer (dict) features; disabled (None) by default.
    dict_feat: Optional[GazetteerTensorizer.Config] = None
    # Optional contextual token embeddings; disabled (None) by default.
    contextual_token_embedding: Optional[
        ByteTokenTensorizer.Config] = None
class ModelInput(Model.Config.ModelInput):
    """Input config: an optional token stream, enabled by default with
    BOS/EOS markers added; set to None to disable token input."""

    tokens: Optional[TokenTensorizer.Config] = TokenTensorizer.Config(
        add_bos_token=True, add_eos_token=True)
class ModelInput(BasePairwiseModel.Config.ModelInput):
    """Input config for a pairwise text model: two token streams plus labels."""

    # First text of the pair, tokenized from the "text1" column.
    tokens1: TokenTensorizer.Config = TokenTensorizer.Config(
        column="text1")
    # Second text of the pair, tokenized from the "text2" column.
    tokens2: TokenTensorizer.Config = TokenTensorizer.Config(
        column="text2")
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class ModelInput(Model.Config.ModelInput):
    """Input config for a seq2seq model: source and target token streams,
    with an optional gazetteer feature."""

    src_seq_tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    trg_seq_tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Optional gazetteer (dict) features; disabled (None) by default.
    dict_feat: Optional[GazetteerTensorizer.Config] = None
class ModelInput(Model.Config.ModelInput):
    """Minimal classification input: a token stream and document labels."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    labels: LabelTensorizer.Config = LabelTensorizer.Config()
class RegressionModelInput(DocModel.Config.ModelInput):
    """Input config for document regression: tokens with numeric targets."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Numeric (regression) targets instead of categorical labels.
    labels: NumericLabelTensorizer.Config = NumericLabelTensorizer.Config(
    )
class ModelInput(Model.Config.ModelInput):
    """Input config with two token streams: the main text and a "slots"
    column tokenized the same way."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Tokenized contents of the "slots" column.
    slots: TokenTensorizer.Config = TokenTensorizer.Config(
        column="slots")
class ModelInput(BaseModel.Config.ModelInput):
    """Input config pairing text tokens with numberized annotation actions."""

    # Reads text from the "tokenized_text" column.
    tokens: TokenTensorizer.Config = TokenTensorizer.Config(
        column="tokenized_text")
    # Numberized action sequence derived from the annotation.
    actions: AnnotationNumberizer.Config = AnnotationNumberizer.Config(
    )
class ModelInput(Model.Config.ModelInput):
    """Input config for word tagging: tokens with per-word labels."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Per-word labels for the token stream.
    labels: WordLabelTensorizer.Config = WordLabelTensorizer.Config()
    # For the metric reporter: the untokenized "text" column.
    raw_text: RawString.Config = RawString.Config(column="text")
class ModelInput(Model.Config.ModelInput):
    """Input config for classification: tokens, labels (unknowns allowed),
    and the raw text for the metric reporter."""

    tokens: TokenTensorizer.Config = TokenTensorizer.Config()
    # Document labels; unseen labels are permitted (allow_unknown).
    labels: LabelTensorizer.Config = LabelTensorizer.Config(allow_unknown=True)
    # For the metric reporter: the untokenized "text" column.
    raw_text: RawString.Config = RawString.Config(column="text")