def test_save_load_custom_head(self):
    """A custom prediction head round-trips through save_head/load_head unchanged."""
    model_name = "bert-base-uncased"
    model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead})
    model1 = AutoModelWithHeads.from_pretrained(model_name, config=model_config)
    model2 = AutoModelWithHeads.from_pretrained(model_name, config=model_config)

    head_config = {"head_type": "tag", "num_labels": 3, "layers": 2, "activation_function": "tanh"}
    model1.add_custom_head("custom_head", head_config)

    # save the head from model1 and load it into model2
    with tempfile.TemporaryDirectory() as temp_dir:
        model1.save_head(temp_dir, "custom_head")
        model2.load_head(temp_dir)

    model1.eval()
    model2.eval()

    in_data = ids_tensor((1, 128), 1000)
    output1 = model1(in_data)
    output2 = model2(in_data)
    self.assertEqual(output1[0].size(), output2[0].size())

    # the loaded head must carry identical weights
    state1 = model1.config.prediction_heads["custom_head"].state_dict()
    state2 = model2.config.prediction_heads["custom_head"].state_dict()
    for (k1, v1), (k2, v2) in zip(state1.items(), state2.items()):
        self.assertTrue(torch.equal(v1, v2))
def test_load_full_model(self):
    """Saving and reloading a full model must preserve its prediction heads."""
    model = AutoModelWithHeads.from_config(self.config())
    model.add_classification_head("dummy", layers=1)
    true_config = model.get_prediction_heads_config()
    with tempfile.TemporaryDirectory() as temp_dir:
        # persist the whole model, then restore it from disk
        model.save_pretrained(temp_dir)
        model = AutoModelWithHeads.from_pretrained(temp_dir)
        # the head must survive the round trip with an identical config
        self.assertIn("dummy", model.heads)
        self.assertDictEqual(true_config, model.get_prediction_heads_config())
def test_loading_adapter_weights_without_prefix(self):
    """Adapter weights saved from a base model load cleanly into a model with heads."""
    model_base, model_with_head_base = create_twin_models(AutoModel, self.config)
    model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config)
    setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base)
    model_base.add_adapter("dummy")

    with tempfile.TemporaryDirectory() as temp_dir:
        model_base.save_adapter(temp_dir, "dummy")
        loading_info = {}
        model_with_head.load_adapter(temp_dir, loading_info=loading_info)

    # every saved weight must be matched, nothing extra
    self.assertEqual(0, len(loading_info["missing_keys"]))
    self.assertEqual(0, len(loading_info["unexpected_keys"]))

    # both models must produce the same output
    input_ids = self.get_input_samples((1, 128), config=model_with_head.config)
    output1 = model_with_head(input_ids)
    output2 = model_base(input_ids)
    self.assertEqual(len(output1), len(output2))
    self.assertTrue(torch.equal(output1[0], output2[0]))
def test_train_single_adapter(self):
    """Train only the "mrpc" adapter; after training, only its weights
    (and the matching head) may differ from the pre-training state."""
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    # add two adapters: one will be trained and the other should be frozen
    model.add_adapter("mrpc")
    model.add_adapter("dummy")
    model.add_classification_head("mrpc")

    self.assertIn("mrpc", model.config.adapters.adapters)
    self.assertIn("dummy", model.config.adapters.adapters)

    # train the mrpc adapter -> should be activated & unfreezed
    model.train_adapter("mrpc")
    self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

    # all weights of the adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc.").items():
        self.assertTrue(v.requires_grad, k)
    # all weights of the adapter not used for training should be freezed
    for k, v in filter_parameters(model, "adapters.dummy.").items():
        self.assertFalse(v.requires_grad, k)
    # weights of the model should be freezed (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    # snapshot for the post-training weight comparison
    state_dict_pre = copy.deepcopy(model.state_dict())

    # setup dataset
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC",
        overwrite_cache=True)
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    training_args = TrainingArguments(
        output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=7,
        no_cuda=True)

    # run a short training loop
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # only keys containing "mrpc" (adapter + head) may have changed
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_train_adapter_fusion(self):
    """Train an AdapterFusion layer over three frozen adapters.

    Only fusion-layer and head weights may change during training, and the
    fusion regularization loss must be computed at least once.
    """
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())
    self.add_head(model, "head")

    # add the adapters to be fused
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")
    self.assertIn("a", model.config.adapters.adapters)
    self.assertIn("b", model.config.adapters.adapters)
    self.assertIn("c", model.config.adapters.adapters)

    # setup fusion
    adapter_setup = Fuse("a", "b", "c")
    model.add_adapter_fusion(adapter_setup)
    model.train_adapter_fusion(adapter_setup)
    model.set_active_adapters(adapter_setup)
    self.assertEqual(adapter_setup, model.active_adapters)

    # all weights of the adapters should be frozen (test for one)
    for k, v in filter_parameters(model, "adapters.a.").items():
        self.assertFalse(v.requires_grad, k)
    # all weights of the fusion layer should be activated
    for k, v in filter_parameters(model, "adapter_fusion_layer").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the model should be freezed (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    # snapshot for the post-training weight comparison
    state_dict_pre = copy.deepcopy(model.state_dict())

    # Since our config has a value matrix, make sure it is regularized.
    # We do this by patching the fusion regularization function.
    regularization_called = False
    orig_fusion_regularization_loss = model.base_model.get_fusion_regularization_loss

    def patched_fusion_reg_loss():
        nonlocal regularization_called
        regularization_called = True
        return orig_fusion_regularization_loss()

    model.base_model.get_fusion_regularization_loss = patched_fusion_reg_loss

    self.trainings_run(model, tokenizer)

    # only fusion-layer and head parameters may differ after training
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if ("adapter_fusion_layer" in k1 or "classifier" in k1
                or "classification_head" in k1 or "score" in k1 or "heads" in k1):
            self.assertFalse(torch.equal(v1, v2), k1)
        else:
            self.assertTrue(torch.equal(v1, v2), k1)
    self.assertTrue(regularization_called)
def test_parallel_inference_with_heads(self):
    """Parallel adapter inference must match running each adapter/head separately."""
    model = AutoModelWithHeads.from_config(self.config())
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_classification_head("a", num_labels=2)
    model.add_classification_head("b", num_labels=3)
    model.eval()

    inputs = {
        "attention_mask": torch.randint(0, 2, size=(2, 128)),
        "input_ids": self.get_input_samples((2, 128), config=model.config),
    }

    # reference: single-adapter passes
    model.active_adapters = "a"
    model.active_head = "a"
    outputs_a = model(**inputs)
    model.active_adapters = "b"
    model.active_head = "b"
    outputs_b = model(**inputs)

    model.active_adapters = Parallel("a", "b")
    # active_adapters should set parallel heads too
    self.assertEqual(model.active_head, ["a", "b"])

    outputs = model(**inputs)
    self.assertEqual(len(outputs), 2)
    self.assertEqual(outputs[0][0].shape, (2, 2))
    self.assertEqual(outputs[1][0].shape, (2, 3))
    self.assertTrue(torch.allclose(outputs[0][0], outputs_a[0]))
    self.assertTrue(torch.allclose(outputs[1][0], outputs_b[0]))
def test_loading_adapter_weights_without_prefix(self):
    """Adapter weights saved from a base model load cleanly into a flex-head model."""
    if self.config_class not in MODEL_WITH_HEADS_MAPPING:
        self.skipTest("Does not support flex heads.")
    model_base, model_with_head_base = create_twin_models(self.model_class, self.config)
    model_with_head = AutoModelWithHeads.from_config(model_with_head_base.config)
    setattr(model_with_head, model_with_head.base_model_prefix, model_with_head_base)
    model_base.add_adapter("dummy")

    with tempfile.TemporaryDirectory() as temp_dir:
        model_base.save_adapter(temp_dir, "dummy")
        loading_info = {}
        model_with_head.load_adapter(temp_dir, loading_info=loading_info)

    # every saved weight must be matched, nothing extra
    self.assertEqual(0, len(loading_info["missing_keys"]))
    self.assertEqual(0, len(loading_info["unexpected_keys"]))

    # both models must produce the same output
    input_data = self.get_input_samples((1, 128), config=model_with_head.config)
    output1 = model_with_head(**input_data)
    output2 = model_base(**input_data)
    self.assertEqual(len(output1), len(output2))
    self.assertTrue(torch.equal(output1[0], output2[0]))
def _load_pipeline_instance(pipeline_class, adapter_id):
    """Build a pipeline whose model has the given hub adapter loaded and activated.

    Raises ValueError when the adapter id cannot be resolved on the hub.
    """
    adapter_info = get_adapter_info(adapter_id, source="hf")
    if adapter_info is None:
        raise ValueError(f"Adapter with id '{adapter_id}' not available.")

    # instantiate tokenizer and base model, then attach the adapter
    tokenizer = AutoTokenizer.from_pretrained(adapter_info.model_name)
    model = AutoModelWithHeads.from_pretrained(adapter_info.model_name)
    model.load_adapter(adapter_id, source="hf", set_active=True)

    return pipeline_class(model=model, tokenizer=tokenizer)
def run_test(self, static_model, input_shape=None, label_dict=None):
    """Copy a static-head model's weights into a flex-head model, reload the
    head, and verify both models produce the same loss and logits.

    Args:
        static_model: model with a fixed (static) prediction head.
        input_shape: optional (batch, seq_len) tuple; defaults to
            (self.batch_size, self.seq_length).
        label_dict: optional mapping of label kwargs merged into the forward call.
    """
    flex_model = AutoModelWithHeads.from_pretrained(
        None, config=self.config(), state_dict=static_model.state_dict())
    static_model.eval()
    flex_model.eval()
    # NOTE(review): the second clause compares static_model.base_model against
    # static_model itself (not flex_model) -- presumably to skip only models
    # that are NOT their own base model; confirm this is intended.
    if (static_model.base_model.__class__ != flex_model.base_model.__class__
            and not static_model.base_model == static_model):
        self.skipTest("Skipping as base model classes are different.")
    with tempfile.TemporaryDirectory() as temp_dir:
        static_model.save_head(temp_dir)
        loading_info = {}
        flex_model.load_head(temp_dir, load_as="test", loading_info=loading_info)
    self.assertEqual(
        0, len(loading_info["missing_keys"]),
        "Missing keys: {}".format(", ".join(loading_info["missing_keys"])))
    # We don't need to convert some of the weights, so remove them for the check
    unexpected_keys = loading_info["unexpected_keys"]
    if static_model._keys_to_ignore_on_load_missing is not None:
        for pat in static_model._keys_to_ignore_on_load_missing:
            unexpected_keys = [
                k for k in unexpected_keys if re.search(pat, k) is None
            ]
    # HACK for bert-based models
    if isinstance(static_model, BertPreTrainedModel):
        unexpected_keys = [
            k for k in unexpected_keys if "cls.predictions.bias" not in k
        ]
    elif isinstance(static_model, RobertaPreTrainedModel):
        unexpected_keys = [
            k for k in unexpected_keys if "lm_head.bias" not in k
        ]
    self.assertEqual(
        0, len(unexpected_keys),
        "Unexpected keys: {}".format(", ".join(unexpected_keys)))
    # adapter and head were loaded
    self.assertIn("test", flex_model.heads)
    # check equal output
    input_shape = input_shape or (self.batch_size, self.seq_length)
    in_data = self.get_input_samples(input_shape, config=flex_model.config)
    if label_dict:
        for k, v in label_dict.items():
            in_data[k] = v
    output1 = static_model(**in_data)
    output2 = flex_model(**in_data)
    self.assertTrue(torch.allclose(output1.loss, output2.loss))
    self.assertTrue(torch.allclose(
        output1[1], output2[1]))  # it's not called "logits" for all classes
def test_custom_head_from_model_config(self):
    """A custom head registered via the model config matches the built-in tagging head."""
    model_name = "bert-base-uncased"
    model_config = AutoConfig.from_pretrained(model_name, custom_heads={"tag": CustomHead})
    model = AutoModelWithHeads.from_pretrained(model_name, config=model_config)

    head_config = {"head_type": "tag", "num_labels": 3, "layers": 2, "activation_function": "tanh"}
    model.add_custom_head("custom_head", head_config)
    model.eval()

    in_data = ids_tensor((1, 128), 1000)
    output1 = model(in_data)
    # the equivalent built-in tagging head must yield the same output shape
    model.add_tagging_head("tagging_head", num_labels=3, layers=2)
    output2 = model(in_data)
    self.assertEqual(output1[0].size(), output2[0].size())
def test_delete_head(self):
    """Deleting a head removes it from the model, the config, and the active slot."""
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    head_name = "test_head"
    self.add_head(model, head_name)
    # newly added head is registered and active
    self.assertTrue(head_name in model.heads)
    self.assertTrue(head_name in model.config.prediction_heads)
    self.assertEqual(head_name, model.active_head)

    model.delete_head(head_name)
    # after deletion no trace of the head remains
    self.assertFalse(head_name in model.heads)
    self.assertFalse(head_name in model.config.prediction_heads)
    self.assertNotEqual(head_name, model.active_head)
def test_batch_split_head(self):
    """A BatchSplit active head routes batch slices to different heads."""
    if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"):
        self.skipTest("No classification head available")
    model = AutoModelWithHeads.from_config(self.config())
    model.add_classification_head("a")
    model.add_classification_head("b")
    # first sample goes to head "a", the remaining two to head "b"
    model.active_head = BatchSplit("a", "b", batch_sizes=[1, 2])

    in_data = self.get_input_samples((3, 128), config=model.config)
    out = model(**in_data)

    self.assertEqual(2, len(out))
    self.assertEqual((1, 2), out[0][0].shape)
    self.assertEqual((2, 2), out[1][0].shape)
def test_model_with_heads_tagging_head_labels(self):
    """Label data of a tagging head survives head save/load and adapter loading."""
    model = AutoModelWithHeads.from_pretrained(self.model_name, config=self.config)
    model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map)
    with TemporaryDirectory() as temp_dir:
        model.save_head(temp_dir, "test_head")
        model.load_head(temp_dir)
        # this is just loaded to test whether loading an adapter changes the label information
        model.load_adapter("sst-2", "text_task")
        self.assertEqual(self.labels, model.get_labels())
        self.assertDictEqual(self.label_map, model.get_labels_dict())
def test_batch_split_adapter_head(self):
    """BatchSplit over adapter compositions also splits across the matching heads."""
    model = AutoModelWithHeads.from_config(self.config())
    self.add_head(model, "a")
    self.add_head(model, "b")
    model.add_adapter("a")
    model.add_adapter("b")
    model.add_adapter("c")
    # two samples through the stacked "c"/"a" adapters, one through "b"
    model.set_active_adapters(BatchSplit(Stack("c", "a"), "b", batch_sizes=[2, 1]))

    in_data = self.get_input_samples((3, 128), config=model.config)
    out = model(**in_data)

    self.assertEqual(2, len(out))
    self.assertTrue(isinstance(model.active_head, BatchSplit))
def test_model_with_heads_multiple_heads(self):
    """Two heads can be saved and reloaded independently without losing label info."""
    model = AutoModelWithHeads.from_pretrained(self.model_name, config=self.config)
    model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map)
    model.add_classification_head("second_head", num_labels=5)
    with TemporaryDirectory() as temp_dir:
        # round-trip each head through its own sub-directory
        for head_name in ("test_head", "second_head"):
            head_dir = temp_dir + "/" + head_name
            model.save_head(head_dir, head_name)
            model.load_head(head_dir)
        model.load_adapter("sst-2", "text_task")
        self.assertEqual(model.get_labels("test_head"), self.labels)
        self.assertEqual(model.get_labels_dict("test_head"), self.label_map)
def test_general(self):
    """AdapterTrainer checkpoints must contain the adapters but not the full
    model weights, and must not disturb the active adapter/head setup."""
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC",
        overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
    model.add_classification_head("task", num_labels=3)

    # add the adapters to be fused
    model.add_adapter("task")
    model.add_adapter("additional_adapter")
    model.train_adapter("task")
    self.assertEqual("task", model.active_head)
    self.assertEqual(Stack("task"), model.active_adapters)

    with TemporaryDirectory() as tempdir:
        training_args = TrainingArguments(
            output_dir=tempdir,
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()

        # Check that adapters are actually saved but the full model is not
        files_dir_checkpoint = [
            file_or_dir
            for file_or_dir in os.listdir(os.path.join(tempdir, "checkpoint-1"))
        ]
        self.assertTrue("task" in files_dir_checkpoint)
        self.assertTrue("additional_adapter" in files_dir_checkpoint)
        # Check that full model weights are not stored
        self.assertFalse("pytorch_model.bin" in files_dir_checkpoint)

        # this should always be false in the adapter trainer
        self.assertFalse(trainer.args.remove_unused_columns)
        # training must not change the active head/adapter setup
        self.assertEqual("task", model.active_head)
        self.assertEqual(Stack("task"), model.active_adapters)
def test_invertible_adapter_with_head(self):
    """The invertible adapter must be invoked exactly twice per forward pass:
    once after the embedding layer and once inside the LM prediction head."""
    # Pick whichever LM head type this model class supports.
    if hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_masked_lm_head"):
        lm_head = "masked_lm"
    elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_causal_lm_head"):
        lm_head = "causal_lm"  # fixed typo: was "casual_lm"
    elif hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_seq2seq_lm_head"):
        lm_head = "seq2seq_lm"
    else:
        # fixed typo ("causel") and mention the seq2seq case too
        self.skipTest("No masked, causal or seq2seq language model head")

    model = AutoModelWithHeads.from_config(self.config())
    # "pfeiffer+inv" adds an invertible adapter at the embedding output
    model.add_adapter("test", config="pfeiffer+inv")
    if lm_head == "causal_lm":
        model.add_causal_lm_head("test")
    elif lm_head == "masked_lm":
        model.add_masked_lm_head("test")
    elif lm_head == "seq2seq_lm":
        model.add_seq2seq_lm_head("test")
    else:
        raise RuntimeError("{} is not a valid lm head".format(lm_head))
    model.set_active_adapters("test")

    # Set a hook before the invertible adapter to make sure it's actually called twice:
    # Once after the embedding layer and once in the prediction head.
    calls = 0

    def forward_pre_hook(module, input):
        nonlocal calls
        calls += 1

    inv_adapter = model.base_model.get_invertible_adapter()
    self.assertIsNotNone(inv_adapter)
    inv_adapter.register_forward_pre_hook(forward_pre_hook)

    in_data = self.get_input_samples(
        (self.batch_size, self.seq_length), config=model.config)
    out = model(**in_data)
    self.assertEqual(
        (self.batch_size, self.seq_length, model.config.vocab_size), out[0].shape)
    self.assertEqual(2, calls)
def test_multiple_heads_label(self):
    """A freshly added classification head gets default labels, unaffected by other heads."""
    model = AutoModelWithHeads.from_pretrained(self.model_name, config=self.config)
    model.add_tagging_head("test_head", num_labels=len(self.labels), id2label=self.label_map)
    with TemporaryDirectory() as temp_dir:
        model.save_head(temp_dir, "test_head")
        model.load_head(temp_dir)
        # adapter loaded for testing whether it changes label information
        model.load_adapter("sst-2", "text_task")
        model.add_classification_head("classification_head")
        default_label, default_label_dict = get_default(2)
        self.assertEqual(model.get_labels("classification_head"), default_label)
        self.assertEqual(model.get_labels_dict("classification_head"), default_label_dict)
def test_parallel_inference_with_wrong_number_of_heads(self):
    """Running two Parallel adapters with a single active head must raise ValueError."""
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()
    model.add_adapter("a")
    model.add_adapter("b")
    self.add_head(model, "a", num_labels=2)

    inputs = self.get_input_samples((2, 128), config=model.config)
    model.active_adapters = Parallel("a", "b")

    # one head (whether given as a list or a plain name) cannot serve two adapters
    for head_setting in (["a"], "a"):
        model.active_head = head_setting
        with self.assertRaises(ValueError):
            model(**inputs)
def test_parallel_training(self):
    """Train two adapters in a Parallel composition; both must receive
    gradients while the base model stays frozen."""
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    model.add_adapter("mrpc1")
    model.add_adapter("mrpc2")
    self.add_head(model, "mrpc1", num_labels=2)
    self.add_head(model, "mrpc2", num_labels=3)
    model.active_adapters = Parallel("mrpc1", "mrpc2")
    model.train_adapter(Parallel("mrpc1", "mrpc2"))
    # model.eval()

    # all weights of the first adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc1.").items():
        self.assertTrue(v.requires_grad, k)
    # the second adapter is part of the Parallel setup, so it is trained as well
    for k, v in filter_parameters(model, "adapters.mrpc2.").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the model should be freezed (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    # snapshot for the post-training weight comparison
    state_dict_pre = copy.deepcopy(model.state_dict())

    train_dataset = self.dataset(tokenizer)
    training_args = TrainingArguments(
        output_dir="./examples", do_train=True, learning_rate=0.1, max_steps=10,
        no_cuda=True
    )

    # run a short training loop
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()

    # only keys containing "mrpc" (adapters + heads) may have changed
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2), k1)
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_train_single_adapter(self):
    """Train only the "mrpc" adapter; after training, only its weights
    (and the matching head) may differ from the pre-training state."""
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    # add two adapters: one will be trained and the other should be frozen
    model.add_adapter("mrpc")
    model.add_adapter("dummy")
    self.add_head(model, "mrpc")
    self.assertIn("mrpc", model.config.adapters.adapters)
    self.assertIn("dummy", model.config.adapters.adapters)

    # train the mrpc adapter -> should be activated & unfreezed
    model.train_adapter("mrpc")
    self.assertEqual(set(["mrpc"]), model.active_adapters.flatten())

    # all weights of the adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc.").items():
        self.assertTrue(v.requires_grad, k)
    # all weights of the adapter not used for training should be freezed
    for k, v in filter_parameters(model, "adapters.dummy.").items():
        self.assertFalse(v.requires_grad, k)
    # weights of the model should be freezed (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    # snapshot for the post-training weight comparison
    state_dict_pre = copy.deepcopy(model.state_dict())

    self.trainings_run(model, tokenizer)

    # only keys containing "mrpc" (adapter + head) may have changed
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
def test_reload_static_to_flex_head(self):
    """An adapter saved from a static-head model loads into a flex-head model
    (twice, to exercise the conversion script) and yields equal logits."""
    if not hasattr(MODEL_WITH_HEADS_MAPPING[self.config_class], "add_classification_head"):
        self.skipTest("No classification head available")
    static_head_model = AutoModelForSequenceClassification.from_config(
        self.config())
    # flex model starts from the same weights as the static model
    flex_head_model = AutoModelWithHeads.from_pretrained(
        None, config=self.config(), state_dict=static_head_model.state_dict())
    static_head_model.eval()
    flex_head_model.eval()

    static_head_model.add_adapter("test")

    with tempfile.TemporaryDirectory() as temp_dir:
        static_head_model.save_adapter(temp_dir, "test")

        loading_info = {}
        flex_head_model.load_adapter(temp_dir, loading_info=loading_info)

        # Load the adapter a second time to make sure our conversion script doesn't break anything
        flex_head_model.load_adapter(temp_dir, loading_info=loading_info)
    self.assertEqual(0, len(loading_info["missing_keys"]))
    self.assertEqual(0, len(loading_info["unexpected_keys"]))

    # adapter and head were loaded
    self.assertIn("test", flex_head_model.config.adapters)
    self.assertIn("test", flex_head_model.heads)

    # check equal output
    in_data = self.get_input_samples((1, 128), config=flex_head_model.config)
    output1 = static_head_model(**in_data, adapter_names=["test"])
    output2 = flex_head_model(**in_data, adapter_names=["test"], head="test")
    self.assertTrue(
        torch.all(torch.isclose(output1.logits, output2.logits)))
def test_batch_split_with_heads(self):
    """BatchSplit output per slice equals running each adapter/head on that slice alone."""
    model = AutoModelWithHeads.from_config(self.config())
    model.add_adapter("a")
    model.add_adapter("b")
    self.add_head(model, "a", num_labels=2)
    self.add_head(model, "b", num_labels=3)
    model.eval()

    inputs = {"input_ids": self.get_input_samples((2, 128), config=model.config)["input_ids"]}
    if isinstance(model, T5ModelWithHeads):
        inputs["decoder_input_ids"] = inputs["input_ids"]

    # reference: run each half of the batch through its single adapter/head
    model.active_adapters = "a"
    model.active_head = "a"
    outputs_a = model(**{k: v[:1] for k, v in inputs.items()})
    model.active_adapters = "b"
    model.active_head = "b"
    outputs_b = model(**{k: v[1:] for k, v in inputs.items()})

    model.set_active_adapters(BatchSplit("a", "b", batch_sizes=[1, 1]))
    output = model(**inputs)

    self.assertEqual(2, len(output))
    self.assertTrue(torch.allclose(output[0]["logits"], outputs_a["logits"], atol=1e-05))
    self.assertTrue(torch.allclose(output[1]["logits"], outputs_b["logits"], atol=1e-05))
def test_parallel_training_single_forward_pass(self):
    """One Parallel forward pass must equal two separate per-adapter passes.

    Uses twin adapters (identical weights) so single-adapter outputs can be
    compared directly against the Parallel composition's head outputs.
    """
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    a1, a2 = self.create_twin_adapters(model, "a")
    b1, b2 = self.create_twin_adapters(model, "b")

    state_dict = model.state_dict()
    # sanity check: twins start from identical weights
    for k, v in state_dict.items():
        if a1 in k:
            self.assertTrue(torch.equal(v, state_dict[k.replace(a1, a2)]))
        if b1 in k:
            self.assertTrue(torch.equal(v, state_dict[k.replace(b1, b2)]))

    input_data = self.get_input_samples((3, 128), config=model.config)
    # T5 expects sequence labels; other models take one label per sample
    if isinstance(model, T5ModelWithHeads):
        input_data["labels"] = torch.randint(0, 2, (3, 128))
    else:
        input_data["labels"] = torch.randint(0, 2, (3, 1))

    # reference: single-adapter passes
    outputs = []
    for adapter in [a1, b1]:
        model.active_head = adapter
        model.set_active_adapters(adapter)
        model.train_adapter(adapter)
        model.eval()
        outputs.append(model(**input_data))

    model.set_active_adapters(Parallel(a2, b2))
    model.train_adapter((Parallel(a2, b2)))
    model.eval()

    parallel_outputs = model(**input_data)

    # each parallel head output must match its single-adapter counterpart
    for out1, out2 in zip(outputs, parallel_outputs.head_outputs):
        self.assertTrue(torch.allclose(out1["loss"], out2["loss"]))
        self.assertTrue(torch.allclose(out1["logits"], out2["logits"], atol=1e-5))
def test_parallel_training_equivalent_to_single_adapters(self):
    """Training adapters in Parallel must yield the same weights as training
    them separately.

    Twin adapters start identical; one of each pair is trained alone, the
    other in a Parallel setup, and the resulting weights are compared.
    """
    model = AutoModelWithHeads.from_config(self.config())
    model.eval()

    a1, a2 = self.create_twin_adapters(model, "a")
    b1, b2 = self.create_twin_adapters(model, "b")

    # build a small fixed dataset so both training runs see identical batches
    dataset = []
    for i in range(3):
        input_data = self.get_input_samples((3, 128), config=model.config)
        # T5 expects sequence labels; other models take one label per sample
        if isinstance(model, T5ModelWithHeads):
            input_data["labels"] = torch.randint(0, 2, (3, 128))
        else:
            input_data["labels"] = torch.randint(0, 2, (3, 1))
        dataset.append(input_data)

    # train each first twin individually
    for adapter in [a1, b1]:
        model.active_head = adapter
        model.set_active_adapters(adapter)
        model.train_adapter(adapter)
        model.eval()

        model = self.train_model(model, dataset)

    # train both second twins together in a Parallel setup
    model.set_active_adapters(Parallel(a2, b2))
    model.train_adapter((Parallel(a2, b2)))
    model.eval()

    model = self.train_model(model, dataset)

    state_dict = model.state_dict()
    # each twin pair must end up with (numerically) identical weights
    for k, v in state_dict.items():
        if a1 in k:
            self.assertTrue(torch.allclose(v, state_dict[k.replace(a1, a2)], atol=1e-5))
        if b1 in k:
            self.assertTrue(torch.allclose(v, state_dict[k.replace(b1, b2)], atol=1e-5))
def test_batch_split_training(self):
    """Train two adapters under a BatchSplit composition; both must receive
    gradients while the base model stays frozen."""
    tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=False)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelWithHeads.from_config(self.config())

    model.add_adapter("mrpc1")
    model.add_adapter("mrpc2")
    self.add_head(model, "mrpc1")
    self.add_head(model, "mrpc2")
    adapter_setup = BatchSplit("mrpc1", "mrpc2", batch_sizes=[1, 1])
    model.active_adapters = adapter_setup
    model.train_adapter(adapter_setup)

    # all weights of the first adapter should be activated
    for k, v in filter_parameters(model, "adapters.mrpc1.").items():
        self.assertTrue(v.requires_grad, k)
    # the second adapter is part of the BatchSplit setup, so it is trained as well
    for k, v in filter_parameters(model, "adapters.mrpc2.").items():
        self.assertTrue(v.requires_grad, k)
    # weights of the model should be freezed (check on some examples)
    for k, v in filter_parameters(model, "encoder.layer.0.attention").items():
        self.assertFalse(v.requires_grad, k)

    # snapshot for the post-training weight comparison
    state_dict_pre = copy.deepcopy(model.state_dict())

    self.trainings_run(model, tokenizer)

    # only keys containing "mrpc" (adapters + heads) may have changed
    for ((k1, v1), (k2, v2)) in zip(state_dict_pre.items(), model.state_dict().items()):
        if "mrpc" in k1:
            self.assertFalse(torch.equal(v1, v2))
        else:
            self.assertTrue(torch.equal(v1, v2))
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args, adapter_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. 
set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset("glue", data_args.task_name) elif data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files={ "train": data_args.train_file, "validation": data_args.validation_file }) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files={ "train": data_args.train_file, "validation": data_args.validation_file }) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels label_list = None if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. 
is_regression = datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, ) # We use the AutoModelWithHeads class here for better adapter support. 
model = AutoModelWithHeads.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) model.add_classification_head( data_args.task_name or "glue", num_labels=num_labels, id2label={i: v for i, v in enumerate(label_list)} if num_labels > 0 else None, ) # Setup adapters if adapter_args.train_adapter: task_name = data_args.task_name or "glue" # check if adapter already exists, otherwise add it if task_name not in model.config.adapters: # resolve the adapter config adapter_config = AdapterConfig.load( adapter_args.adapter_config, non_linearity=adapter_args.adapter_non_linearity, reduction_factor=adapter_args.adapter_reduction_factor, ) # load a pre-trained from Hub if specified if adapter_args.load_adapter: model.load_adapter( adapter_args.load_adapter, config=adapter_config, load_as=task_name, ) # otherwise, add a fresh adapter else: model.add_adapter(task_name, config=adapter_config) # optionally load a pre-trained language adapter if adapter_args.load_lang_adapter: # resolve the language adapter config lang_adapter_config = AdapterConfig.load( adapter_args.lang_adapter_config, non_linearity=adapter_args.lang_adapter_non_linearity, reduction_factor=adapter_args.lang_adapter_reduction_factor, ) # load the language adapter from Hub lang_adapter_name = model.load_adapter( adapter_args.load_lang_adapter, config=lang_adapter_config, load_as=adapter_args.language, ) else: lang_adapter_name = None # Freeze all model weights except of those of this adapter model.train_adapter([task_name]) # Set the adapters to be used in every forward pass if lang_adapter_name: model.set_active_adapters([lang_adapter_name, task_name]) else: model.set_active_adapters([task_name]) else: if adapter_args.load_adapter or adapter_args.load_lang_adapter: raise ValueError( "Adapters can only be loaded in adapters training mode." 
"Use --train_adapter to enable adapter training") # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" max_length = data_args.max_seq_length else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False max_length = None # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and is_regression): # Some have all caps in their config, some don't. label_name_to_id = { k.lower(): v for k, v in model.config.label2id.items() } if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warn( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." 
"\nIgnoring the model labels as a result.", ) elif data_args.task_name is None: label_to_id = {v: i for i, v in enumerate(label_list)} def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [label_to_id[l] for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) train_dataset = datasets["train"] eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.task_name is not None: test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from # compute_metrics # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. 
def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list( result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids)**2).mean().item()} else: return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. data_collator=default_data_collator if data_args.pad_to_max_length else None, do_save_full_model=not adapter_args.train_adapter, do_save_adapters=adapter_args.train_adapter, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. 
isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(datasets["validation_mismatched"]) for eval_dataset, task in zip(eval_datasets, tasks): eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info(f"***** Eval results {task} *****") for key, value in eval_result.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") eval_results.update(eval_result) if training_args.do_predict: logger.info("*** Test ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] test_datasets = [test_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") test_datasets.append(datasets["test_mismatched"]) for test_dataset, task in zip(test_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. test_dataset.remove_columns_("label") predictions = trainer.predict( test_dataset=test_dataset).predictions predictions = np.squeeze( predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_test_file, "w") as writer: logger.info(f"***** Test results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n") return eval_results
def test_reloading_prediction_head(self):
    """Train one step with an AdapterFusion setup, then verify that a second,
    freshly-built model can resume from the saved checkpoint and ends up with
    identical adapter/head weights and configuration.

    NOTE(review): relies on the MRPC fixture under
    ./tests/fixtures/tests_samples/MRPC being present — verify in CI setup.
    """
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    data_args = GlueDataTrainingArguments(
        task_name="mrpc", data_dir="./tests/fixtures/tests_samples/MRPC", overwrite_cache=True
    )
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="train")
    model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
    # Two heads: "dummy" is added last, so it becomes the active head
    # (asserted below).
    model.add_classification_head("adapter", num_labels=3)
    model.add_classification_head("dummy", num_labels=2)
    # add the adapters to be fused
    model.add_adapter("adapter")
    model.add_adapter("additional_adapter")
    # setup fusion
    adapter_setup = Fuse("adapter", "additional_adapter")
    model.add_adapter_fusion(adapter_setup)
    model.train_adapter_fusion(adapter_setup)
    model.set_active_adapters(adapter_setup)
    self.assertEqual(adapter_setup, model.active_adapters)
    self.assertEqual("dummy", model.active_head)
    with TemporaryDirectory() as tempdir:
        # max_steps=1 with save_steps=1 guarantees exactly one checkpoint is
        # written, which the resumed trainer below picks up.
        training_args = TrainingArguments(
            output_dir=tempdir,
            do_train=True,
            learning_rate=0.1,
            logging_steps=1,
            max_steps=1,
            save_steps=1,
            remove_unused_columns=False,
        )
        trainer = AdapterTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
        )
        trainer.train()
        # create second model that should resume the training of the first
        model_resume = AutoModelWithHeads.from_pretrained("bert-base-uncased")
        # Mirror the exact head/adapter construction order of the first model
        # so the state dicts line up key-for-key in the zip comparison below.
        model_resume.add_classification_head("adapter", num_labels=3)
        model_resume.add_classification_head("dummy", num_labels=2)
        model_resume.add_adapter("adapter")
        model_resume.add_adapter("additional_adapter")
        # setup fusion
        adapter_setup = Fuse("adapter", "additional_adapter")
        model_resume.add_adapter_fusion(adapter_setup)
        model_resume.train_adapter_fusion(adapter_setup)
        model_resume.set_active_adapters(adapter_setup)
        trainer_resume = AdapterTrainer(
            model=model_resume,
            args=TrainingArguments(do_train=True, max_steps=1, output_dir=tempdir),
            train_dataset=train_dataset,
        )
        # resume_from_checkpoint=True loads the latest checkpoint in tempdir.
        trainer_resume.train(resume_from_checkpoint=True)
        self.assertEqual("dummy", model.active_head)
        self.assertEqual(model.config.adapters.adapters, model_resume.config.adapters.adapters)
        # Only adapter/head weights are compared for equality: the frozen base
        # model weights were never trained, and other keys may legitimately
        # differ in value but must still match by name.
        for ((k1, v1), (k2, v2)) in zip(
            trainer.model.state_dict().items(), trainer_resume.model.state_dict().items()
        ):
            self.assertEqual(k1, k2)
            if "adapter" in k1 or "dummy" in k1:
                self.assertTrue(torch.equal(v1, v2), k1)
# Build the `datasets` splits from the raw text/label lists.
for source_split in ['train', 'val', 'test']:
    columns = {
        "text": dataset_dict[f'{source_split}_text'],
        'labels': dataset_dict[f'{source_split}_labels'],
    }
    # The xlm-t dump names this split 'val', while the datasets library
    # convention is 'validation' — remap only the storage key.
    target_split = 'validation' if source_split == 'val' else source_split
    dataset[target_split] = Dataset.from_dict(columns)

# --- MODEL ---
config = AutoConfig.from_pretrained(MODEL, num_labels=NUM_LABELS)
model = AutoModelWithHeads.from_pretrained(MODEL, config=config)

# Register a fresh task adapter under a run-unique name ...
adapter_name = f"adapter_{UNIQUE_NAME}"
model.add_adapter(adapter_name, AdapterType.text_task)
# ... together with a classification head sharing that name.
model.add_classification_head(
    adapter_name,
    num_labels=NUM_LABELS,
    id2label={0: "Neg", 1: "Neu", 2: "Pos"},
)
# Activate the adapter
def main():
    """Fine-tune (optionally via adapters) a pretrained model on a GLUE task.

    Parses model/data/training/adapter arguments, builds the model with a
    classification head, optionally sets up task and language adapters, then
    runs training, evaluation and prediction as requested by the flags.

    Returns:
        dict: the accumulated evaluation metrics (empty if --do_eval not set).
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging: only the main process logs at INFO in distributed runs.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Resolve label count and output mode from the GLUE task registries;
    # an unknown task name is reported as a ValueError.
    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelWithHeads.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )
    # Prediction head named after the task so it pairs with the task adapter.
    model.add_classification_head(data_args.task_name, num_labels=num_labels)

    # Setup adapters
    if adapter_args.train_adapter:
        task_name = data_args.task_name
        # check if adapter already exists, otherwise add it
        if task_name not in model.config.adapters.adapter_list(AdapterType.text_task):
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
            )
            # load a pre-trained from Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    AdapterType.text_task,
                    config=adapter_config,
                    load_as=task_name,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(task_name, AdapterType.text_task, config=adapter_config)
        # optionally load a pre-trained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
            )
            # load the language adapter from Hub
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                AdapterType.text_lang,
                config=lang_adapter_config,
                load_as=adapter_args.language,
            )
        else:
            lang_adapter_name = None
        # Freeze all model weights except of those of this adapter
        model.train_adapter([task_name])
        # Set the adapters to be used in every forward pass: language adapter
        # (if any) first, then the task adapter stacked on top.
        if lang_adapter_name:
            model.set_active_adapters([lang_adapter_name, task_name])
        else:
            model.set_active_adapters([task_name])

    # Get datasets (each only if the corresponding phase is enabled)
    train_dataset = GlueDataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="dev") if training_args.do_eval else None
    test_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode="test") if training_args.do_predict else None

    def compute_metrics(p: EvalPrediction) -> Dict:
        """Compute GLUE metrics for the configured task from raw predictions."""
        # NOTE(review): `preds` is only bound for "classification"/"regression"
        # output modes; any other mode would raise NameError — in practice the
        # GLUE registries above only yield these two modes.
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    # Initialize our Trainer. Full-model vs adapter-only saving is mutually
    # exclusive and driven by the --train_adapter flag.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
    )

    # Training: resume from the checkpoint directory if one was given.
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            eval_datasets.append(GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev"))

        for eval_dataset in eval_datasets:
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt")
            # Only the main process writes result files in distributed runs.
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info(" %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        # Loop to handle MNLI double evaluation (matched, mis-matched)
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            test_datasets.append(GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test"))

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            # For classification, convert logits to hard label indices.
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            # Map the predicted index back to its label string.
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results