def test_caching_with_lazy_reader_in_multi_process_loader(self):
    data_file = (
        AllenNlpTestCase.FIXTURES_ROOT
        / "data"
        / "text_classification_json"
        / "imdb_corpus.jsonl"
    )
    reader = TextClassificationJsonReader(lazy=True, cache_directory=self.cache_directory)
    deque(
        DataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2),
        maxlen=0,
    )

    # We shouldn't write to the cache when the data is being loaded from multiple
    # processes.
    cache_file = reader._get_cache_location_for_file_path(str(data_file))
    assert not os.path.exists(cache_file)

    # But try again from the main process and we should see the cache file.
    instances = list(reader.read(data_file))
    assert instances
    assert os.path.exists(cache_file)

    # Reading again from a multi-process loader should read from the cache.
    new_instances = list(
        DataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=2)
    )
    assert len(instances) == len(new_instances)
def test_batch_count(self):
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(dataset, batch_size=2, padding_noise=0, sorting_keys=["text"])
    # We use a custom collate_fn for testing, which doesn't actually create tensors,
    # just the allennlp Batches.
    dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))

    assert len(dataloader) == 3
def test_multi_processing_with_lazy_dataset_warns():
    def fake_instance_generator(file_name: str) -> Iterable[Instance]:
        yield from []

    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        DataLoader(AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1)
def test_batch_of_entirely_empty_lists_works(self):
    dataset = AllennlpDataset([self.empty_instance, self.empty_instance], self.vocab)

    model = DummyModel(self.vocab)
    model.eval()
    loader = DataLoader(dataset, batch_size=2)
    batch = next(iter(loader))
    model.forward(**batch)
def test_max_instances_with_multi_process_loader(self, num_workers):
    data_file = (
        AllenNlpTestCase.FIXTURES_ROOT
        / "data"
        / "text_classification_json"
        / "imdb_corpus.jsonl"
    )
    reader = TextClassificationJsonReader(max_instances=2, lazy=True)
    instances = list(
        DataLoader(reader.read(data_file), collate_fn=lambda b: b[0], num_workers=num_workers)
    )
    assert len(instances) == 2
def test_elmo_bilm(self):
    # get the raw data
    sentences, expected_lm_embeddings = self._load_sentences_embeddings()

    # load the test model
    elmo_bilm = _ElmoBiLm(self.options_file, self.weight_file)

    # Deal with the data.
    indexer = ELMoTokenCharactersIndexer()

    # For each sentence, first create a TextField, then create an instance
    instances = []
    for batch in zip(*sentences):
        for sentence in batch:
            tokens = [Token(token) for token in sentence.split()]
            field = TextField(tokens, {"character_ids": indexer})
            instance = Instance({"elmo": field})
            instances.append(instance)

    vocab = Vocabulary()
    dataset = AllennlpDataset(instances, vocab)

    # Now finally we can iterate through batches.
    loader = DataLoader(dataset, 3)
    for i, batch in enumerate(loader):
        lm_embeddings = elmo_bilm(batch["elmo"]["character_ids"]["elmo_tokens"])
        top_layer_embeddings, mask = remove_sentence_boundaries(
            lm_embeddings["activations"][2], lm_embeddings["mask"]
        )

        # check the mask lengths
        lengths = mask.data.numpy().sum(axis=1)
        batch_sentences = [sentences[k][i] for k in range(3)]
        expected_lengths = [len(sentence.split()) for sentence in batch_sentences]
        self.assertEqual(lengths.tolist(), expected_lengths)

        # get the expected embeddings and compare!
        expected_top_layer = [expected_lm_embeddings[k][i] for k in range(3)]
        for k in range(3):
            self.assertTrue(
                numpy.allclose(
                    top_layer_embeddings[k, : lengths[k], :].data.numpy(),
                    expected_top_layer[k],
                    atol=1.0e-6,
                )
            )
def test_drop_last_works(self):
    dataset = AllennlpDataset(self.instances, vocab=self.vocab)
    sampler = BucketBatchSampler(
        dataset,
        batch_size=2,
        padding_noise=0,
        sorting_keys=["text"],
        drop_last=True,
    )
    # We use a custom collate_fn for testing, which doesn't actually create tensors,
    # just the allennlp Batches.
    dataloader = DataLoader(dataset, batch_sampler=sampler, collate_fn=lambda x: Batch(x))
    batches = [batch for batch in iter(dataloader)]
    stats = self.get_batches_stats(batches)

    # all batches have length batch_size
    assert all(batch_len == 2 for batch_len in stats["batch_lengths"])

    # we should have lost one instance by skipping the last batch
    assert stats["total_instances"] == len(self.instances) - 1
def get_accuracy(model, dev_dataset, vocab, trigger_token_ids=None, snli=False):
    """
    When trigger_token_ids is None, gets accuracy on the dev_dataset.
    Otherwise, gets accuracy with triggers prepended for the whole dev_dataset.
    """
    model.get_metrics(reset=True)
    model.eval()  # model should be in eval() already, but just in case
    data_loader = DataLoader(dev_dataset, batch_sampler=BucketBatchSampler(dev_dataset, batch_size=128))
    if trigger_token_ids is None:
        for batch in data_loader:
            evaluate_batch(model, batch, trigger_token_ids, snli)
        print("Without Triggers: " + str(model.get_metrics()['accuracy']))
    else:
        print_string = ""
        for idx in trigger_token_ids:
            print_string = print_string + vocab.get_token_from_index(idx) + ', '
        for batch in data_loader:
            evaluate_batch(model, batch, trigger_token_ids, snli)
        print("Current Triggers: " + print_string + " : " + str(model.get_metrics()['accuracy']))
def test_loader_uses_all_instances_when_batches_per_epochs_set(lazy):
    NUM_INSTANCES = 20
    BATCH_SIZE = 2
    BATCHES_PER_EPOCH = 3
    EPOCHS = 4

    class FakeDatasetReader(DatasetReader):
        def _read(self, filename: str) -> Iterable[Instance]:
            for i in range(NUM_INSTANCES):
                yield Instance({"index": LabelField(i, skip_indexing=True)})

    reader = FakeDatasetReader(lazy=lazy)
    dataset = reader.read("blah")

    loader = DataLoader(dataset, batch_size=BATCH_SIZE, batches_per_epoch=BATCHES_PER_EPOCH)
    epoch_batches = []
    for epoch in range(EPOCHS):
        batches = []
        for batch in loader:
            instances = []
            for index in batch["index"]:
                instances.append(index)
            batches.append(instances)
        epoch_batches.append(batches)

    assert epoch_batches == [
        # Epoch 0.
        [[0, 1], [2, 3], [4, 5]],
        # Epoch 1.
        [[6, 7], [8, 9], [10, 11]],
        # Epoch 2.
        [[12, 13], [14, 15], [16, 17]],
        # Epoch 3.
        [[18, 19], [0, 1], [2, 3]],
    ]
def fine_tune_model(model: Model,
                    params: Params,
                    serialization_dir: str,
                    extend_vocab: bool = False,
                    file_friendly_logging: bool = False,
                    batch_weight_key: str = "",
                    embedding_sources_mapping: Dict[str, str] = None,
                    in_fold=None,
                    num_folds=None,
                    ewc_weight=None) -> Model:
    """
    Fine tunes the given model, using a set of parameters that is largely identical to those used
    for :func:`~allennlp.commands.train.train_model`, except that the ``model`` section is ignored,
    if it is present (as we are already given a ``Model`` here).

    The main difference between the logic done here and the logic done in ``train_model`` is that
    here we do not worry about vocabulary construction or creating the model object.  Everything
    else is the same.

    Parameters
    ----------
    model : ``Model``
        A model to fine tune.
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment
    serialization_dir : ``str``
        The directory in which to save results and logs.
    extend_vocab : ``bool``, optional (default=False)
        If ``True``, we use the new instances to extend your vocabulary.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we add newlines to tqdm output, even on an interactive terminal, and we slow
        down tqdm's output to only once every 10 seconds.
    batch_weight_key : ``str``, optional (default="")
        If non-empty, name of metric used to weight the loss on a per-batch basis.
    embedding_sources_mapping : ``Dict[str, str]``, optional (default=None)
        Mapping from model paths to the pretrained embedding filepaths used during fine-tuning.
    """
    prepare_environment(params)
    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f"Serialization directory ({serialization_dir}) "
                                 f"already exists and is not empty.")

    os.makedirs(serialization_dir, exist_ok=True)
    prepare_global_logging(serialization_dir, file_friendly_logging)

    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    if params.pop('model', None):
        logger.warning("You passed parameters for the model in your configuration file, but we "
                       "are ignoring them, using instead the model parameters in the archive.")

    vocabulary_params = params.pop('vocabulary', {})
    if vocabulary_params.get('directory_path', None):
        logger.warning("You passed `directory_path` in parameters for the vocabulary in "
                       "your configuration file, but it will be ignored. ")

    all_datasets = datasets_from_params(params)
    vocab = model.vocab

    if extend_vocab:
        datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

        for dataset in datasets_for_vocab_creation:
            if dataset not in all_datasets:
                raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

        logger.info("Extending model vocabulary using %s data.", ", ".join(datasets_for_vocab_creation))
        vocab.extend_from_instances(vocabulary_params,
                                    (instance for key, dataset in all_datasets.items()
                                     for instance in dataset
                                     if key in datasets_for_vocab_creation))

        model.extend_embedder_vocab(embedding_sources_mapping)

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)

    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    dl_params = params.pop("data_loader")

    if test_data is not None:
        rand = random.Random(1234)
        test_data.index_with(vocab)
        shuffled_test = copy(test_data.instances)
        rand.shuffle(shuffled_test)
        extra_test = shuffled_test[:2000]

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": AllennlpDataset(extra_test, vocab)})
        extra_test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": test_data})
        test_loader = DataLoader.from_params(params.pop("test_data_loader", keys))

    master_model = model
    global_metrics = {}
    training_metrics = []
    final_metrics = {}
    master_trainer = trainer_params.as_dict()

    if num_folds is not None:
        rand = random.Random(1234)

        fold_train = []
        fold_test = []
        fold_train_loader = []
        fold_test_loader = []

        shuffled_instances = copy(train_data.instances)
        rand.shuffle(shuffled_instances)

        kfold = KFold(n_splits=num_folds, random_state=None, shuffle=False)
        computed_folds = list(kfold.split(shuffled_instances))

        for fold in range(num_folds):
            train_indexes, test_indexes = computed_folds[fold]
            new_train = [shuffled_instances[i] for i in train_indexes]
            new_test = [shuffled_instances[i] for i in test_indexes]

            fold_train.append(AllennlpDataset(new_train, vocab=vocab))
            fold_test.append(AllennlpDataset(new_test, vocab=vocab))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_test[-1]})
            fold_test_loader.append(DataLoader.from_params(params.pop("fold_test_data_loader", keys)))

            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": fold_train[-1]})
            fold_train_loader.append(DataLoader.from_params(params.pop("fold_train_data_loader", keys)))

        for fold in ([in_fold] if in_fold is not None else range(num_folds)):
            fold_model = deepcopy(master_model)
            eval_epoch_callback = EvalEpochCallback(fold, fold_test_loader[fold], test_loader, global_metrics)
            callbacks = [eval_epoch_callback]

            if ewc_weight is not None:
                ewc = EWC(extra_test_loader)

                def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                    ewc_loss = 0
                    if ewc.model.training:
                        ewc_loss = ewc.penalty(ewc.model)
                    ret = ewc.model.old_forward(*args, **kwargs)
                    ret["loss"] += ewc_weight * ewc_loss
                    return ret

                fold_model.old_forward = fold_model.forward
                fold_model.forward = ewc_forward
                callbacks.append(CallLossCallback(ewc))

            trainer = Trainer.from_params(model=fold_model,
                                          serialization_dir=serialization_dir,
                                          data_loader=fold_train_loader[fold],
                                          train_data=train_data,
                                          validation_data=None,
                                          params=Params(deepcopy(master_trainer)),
                                          validation_data_loader=None,
                                          epoch_callbacks=callbacks)

            training_metrics.append(trainer.train())

            del fold_model
            del trainer
            del eval_epoch_callback

            state = glob(serialization_dir + "/*.th")
            for file in state:
                logger.info("deleting state - {}".format(file))
                os.unlink(file)
    else:
        callbacks = []

        if ewc_weight is not None:
            ewc = EWC(extra_test_loader)

            def ewc_forward(*args, **kwargs) -> Dict[str, torch.Tensor]:
                ewc_loss = 0
                if ewc.model.training:
                    ewc_loss = ewc.penalty(ewc.model)
                ret = ewc.model.old_forward(*args, **kwargs)
                ret["loss"] += ewc_weight * ewc_loss
                return ret

            model.old_forward = model.forward
            model.forward = ewc_forward
            callbacks.append(CallLossCallback(ewc))

        keys = deepcopy(dl_params.as_dict())
        keys.update({"dataset": train_data})
        train_data.index_with(vocab)
        train_data_loader = DataLoader.from_params(params.pop("train_loader", keys))

        if validation_data is not None:
            validation_data.index_with(vocab)
            keys = deepcopy(dl_params.as_dict())
            keys.update({"dataset": validation_data})
            validation_data_loader = DataLoader.from_params(params.pop("validation_loader", keys))
        else:
            validation_data_loader = None

        if "finetune" in dir(model):
            model.finetune()
            logger.info("Fine tuning model")

        trainer = Trainer.from_params(model=model,
                                      serialization_dir=serialization_dir,
                                      data_loader=train_data_loader,
                                      train_data=train_data,
                                      validation_data=None,
                                      params=Params(deepcopy(master_trainer)),
                                      validation_data_loader=validation_data_loader,
                                      epoch_callbacks=callbacks)

        training_metrics = trainer.train()
        archive_model(serialization_dir)

    final_metrics["fine_tune"] = global_metrics
    final_metrics["training"] = training_metrics

    metrics_json = json.dumps(final_metrics, indent=2)
    with open(os.path.join(serialization_dir, "metrics.json"), "w") as metrics_file:
        metrics_file.write(metrics_json)
    logger.info("Metrics: %s", metrics_json)

    return model
def run(all_code_types, d_embedding, embedding_dropout_p, min_count, batch_size,
        verbose, epochs, lr, wd, logsig, sig_depth, run_name, patience, add_time,
        leadlag, t_scale, t_max, use_timestamps, feedforward_num_layers,
        feedforward_hidden_dims, feedforward_activations, feedforward_dropout,
        training_proportion=1, testing_subsample_size=None, split_paths=False,
        tensorboard_log=False, evaluate_on_test=True):
    """Run the experiment for either cross validation or testing"""
    dataset, dataset_test, vocab = generate_ml_data(
        all_code_types,
        min_count,
        batch_size,
        verbose=verbose,
        allen_mode=True,
        dataset_path=None,
        training_proportion=training_proportion,
        testing_subsample_size=testing_subsample_size,
        split_paths=split_paths)

    logger.info("Using k-fold cross validation")

    # Allen kfold
    metrics_by_fold = []
    cross_validator = StratifiedKFold(n_splits=K_FOLDS, shuffle=True)
    n_splits = cross_validator.get_n_splits(dataset)

    for fold_index, (train_indices, validation_indices) in enumerate(cross_validator(dataset)):
        logger.info(f"Fold {fold_index}/{n_splits - 1}")
        train_dataset = Subset(dataset, train_indices)
        validation_dataset = Subset(dataset, validation_indices)

        train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
        validation_loader = DataLoader(dataset=validation_dataset, batch_size=batch_size, shuffle=True)

        if tensorboard_log or evaluate_on_test:
            serialization_dir = os.path.join(TENSORBOARD_DIR, run_name, str(uuid.uuid4()), str(fold_index))
        else:
            serialization_dir = None

        model = init_sig(vocab, d_embedding, embedding_dropout_p, sig_depth, logsig,
                         all_code_types, feedforward_num_layers, feedforward_hidden_dims,
                         feedforward_activations, feedforward_dropout, leadlag, add_time,
                         t_max, t_scale, use_timestamps, split_paths)

        if torch.cuda.is_available():
            cuda_device = 0
            model = model.cuda(cuda_device)
            logger.info('USING CUDA GPU')
        else:
            cuda_device = -1

        fold_metrics, model = train_model(model, lr, wd, train_loader, validation_loader,
                                          patience, epochs, cuda_device, serialization_dir)

        if serialization_dir is not None:
            # Add file location to sacred log
            ex.add_artifact(os.path.join(serialization_dir, 'best.th'))

        metrics_by_fold.append(fold_metrics)

    if evaluate_on_test:
        if serialization_dir is None:
            raise Exception('serialization_dir needed to load best model from validation')
        # Held out test data
        test_dataloader = DataLoader(dataset=dataset_test, batch_size=batch_size, shuffle=True)
        metrics = evaluate(model, test_dataloader, cuda_device)
        return metrics

    torch.cuda.empty_cache()

    metrics = reformat_metrics(metrics_by_fold, ex)
    return metrics
def main():
    # load the binary SST dataset.
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer},
                                                    use_subtrees=True)
    train_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/train.txt')
    reader = StanfordSentimentTreeBankDatasetReader(granularity="2-class",
                                                    token_indexers={"tokens": single_id_indexer})
    dev_data = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/sst/dev.txt')
    # test_dataset = reader.read('data/sst/test.txt')

    vocab = Vocabulary.from_instances(train_data)
    train_data.index_with(vocab)
    dev_data.index_with(vocab)

    # Randomly initialize vectors
    if EMBEDDING_TYPE == "None":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=300)
        word_embedding_dim = 300
    # Load word2vec vectors
    elif EMBEDDING_TYPE == "w2v":
        embedding_path = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
        weight = _read_pretrained_embeddings_file(embedding_path,
                                                  embedding_dim=300,
                                                  vocab=vocab,
                                                  namespace="tokens")
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=300,
                                    weight=weight,
                                    trainable=False)
        word_embedding_dim = 300

    # Initialize model, cuda(), and optimizer
    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    encoder = PytorchSeq2VecWrapper(torch.nn.LSTM(word_embedding_dim,
                                                  hidden_size=512,
                                                  num_layers=2,
                                                  batch_first=True))
    model = LstmClassifier(word_embeddings, encoder, vocab)
    model.cuda()

    # where to save the model
    model_path = "/tmp/" + EMBEDDING_TYPE + "_" + "model.th"
    vocab_path = "/tmp/" + EMBEDDING_TYPE + "_" + "vocab"
    # if the model already exists (it's been trained), load the pre-trained weights and vocabulary
    if os.path.isfile(model_path):
        vocab = Vocabulary.from_files(vocab_path)
        model = LstmClassifier(word_embeddings, encoder, vocab)
        with open(model_path, 'rb') as f:
            model.load_state_dict(torch.load(f))
    # otherwise train model from scratch and save its weights
    else:
        train_sampler = BucketBatchSampler(train_data, batch_size=32, sorting_keys=["tokens"])
        dev_sampler = BucketBatchSampler(dev_data, batch_size=32, sorting_keys=["tokens"])
        train_loader = DataLoader(train_data, batch_sampler=train_sampler)
        dev_loader = DataLoader(dev_data, batch_sampler=dev_sampler)
        optimizer = optim.Adam(model.parameters())
        trainer = GradientDescentTrainer(model=model,
                                         optimizer=optimizer,
                                         data_loader=train_loader,
                                         validation_data_loader=dev_loader,
                                         num_epochs=5,
                                         patience=1,
                                         cuda_device=0)
        trainer.train()
        with open(model_path, 'wb') as f:
            torch.save(model.state_dict(), f)
        vocab.save_to_files(vocab_path)
    model.train().cuda()  # rnn cannot do backwards in eval mode

    # Register a gradient hook on the embeddings. This saves the gradient w.r.t. the word embeddings.
    # We use the gradient later in the attack.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # also save the word embedding matrix

    # Build k-d Tree if you are using gradient + nearest neighbor attack
    # tree = KDTree(embedding_weight.numpy())

    # filter the dataset to only positive or negative examples
    # (the trigger will cause the opposite prediction)
    dataset_label_filter = "0"
    targeted_dev_data = []
    for instance in dev_data:
        if instance['label'].label == dataset_label_filter:
            targeted_dev_data.append(instance)
    targeted_dev_data = AllennlpDataset(targeted_dev_data, vocab)

    # get accuracy before adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids=None)
    model.train()  # rnn cannot do backwards in eval mode

    # initialize triggers which are concatenated to the input
    num_trigger_tokens = 3
    trigger_token_ids = [vocab.get_token_index("the")] * num_trigger_tokens

    # Use batches of size universal_perturb_batch_size for the attacks.
    universal_perturb_batch_size = 128
    targeted_sampler = BasicBatchSampler(sampler=SequentialSampler(targeted_dev_data),
                                         batch_size=universal_perturb_batch_size,
                                         drop_last=False)  # TODO don't drop last
    targeted_loader = DataLoader(targeted_dev_data, batch_sampler=targeted_sampler)

    # sample batches, update the triggers, and repeat
    for epoch in range(5):
        for batch in targeted_loader:
            # get accuracy with current triggers
            utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
            model.train()  # rnn cannot do backwards in eval mode

            # get gradient w.r.t. trigger embeddings for current batch
            averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids)

            # pass the gradients to a particular attack to generate token candidates for each token.
            cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                            embedding_weight,
                                                            trigger_token_ids,
                                                            num_candidates=40,
                                                            increase_loss=True)
            # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
            #                                                trigger_token_ids,
            #                                                num_candidates=40)
            # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
            #                                                        embedding_weight,
            #                                                        trigger_token_ids,
            #                                                        tree,
            #                                                        100,
            #                                                        num_candidates=40,
            #                                                        increase_loss=True)

            # Tries all of the candidates and returns the trigger sequence with highest loss.
            trigger_token_ids = utils.get_best_candidates(model,
                                                          batch,
                                                          trigger_token_ids,
                                                          cand_trigger_token_ids)

    # print accuracy after adding triggers
    utils.get_accuracy(model, targeted_dev_data, vocab, trigger_token_ids)
def test_multi_processing_with_lazy_dataset_warns():
    with pytest.warns(UserWarning, match=r".*deadlocks.*"):
        DataLoader(AllennlpLazyDataset(fake_instance_generator, "nonexistent_file"), num_workers=1)
def evaluate_model(self):
    self.from_pretrained()
    print(evaluate(self.model, DataLoader(self.dev_data, 32), 0, None))
def train(self):
    if self.config.adjust_point:
        ram_set_flag("adjust_point")
    # ram_write('dist_reg', self.config.dist_reg)
    read_hyper_ = partial(read_hyper, self.config.task_id, self.config.arch)
    num_epochs = int(read_hyper_("num_epochs"))
    batch_size = int(read_hyper_("batch_size"))
    logger.info(f"num_epochs: {num_epochs}, batch_size: {batch_size}")

    if self.config.model_name == 'tmp':
        p = pathlib.Path('saved/models/tmp')
        if p.exists():
            shutil.rmtree(p)

    # Maybe we will do some data augmentation here.
    if self.config.aug_data != '':
        log(f'Augment data from {self.config.aug_data}')
        aug_data = auto_create(
            f"{self.config.task_id}.{self.config.arch}.aug",
            lambda: self.reader.read(self.config.aug_data),
            cache=True)
        self.train_data.instances.extend(aug_data.instances)

    # Set up the adversarial training policy
    if self.config.arch == 'bert':
        model_vocab = embed_util.get_bert_vocab()
    else:
        model_vocab = self.vocab
    # yapf: disable
    adv_field = 'sent2' if is_sentence_pair(self.config.task_id) and self.config.arch != 'bert' else 'sent'
    policy_args = {
        "adv_iteration": self.config.adv_iter,
        "replace_num": self.config.adv_replace_num,
        "searcher": WordIndexSearcher(
            CachedWordSearcher(
                "external_data/ibp-nbrs.json" if not self.config.big_nbrs else "external_data/euc-top8.json",
                model_vocab.get_token_to_index_vocabulary("tokens"),
                second_order=False
            ),
            word2idx=model_vocab.get_token_index,
            idx2word=model_vocab.get_token_from_index,
        ),
        'adv_field': adv_field
    }
    # yapf: enable
    if self.config.adv_policy == 'hot':
        if is_sentence_pair(self.config.task_id) and self.config.arch != 'bert':
            policy_args['forward_order'] = 1
        adv_policy = adv_utils.HotFlipPolicy(**policy_args)
    elif self.config.adv_policy == 'rdm':
        adv_policy = adv_utils.RandomNeighbourPolicy(**policy_args)
    elif self.config.adv_policy == 'diy':
        adv_policy = adv_utils.DoItYourselfPolicy(self.config.adv_iter, adv_field, self.config.adv_step)
    else:
        adv_policy = adv_utils.NoPolicy

    # A collate_fn will do some transformation on an instance before it is
    # fed into a model. If we want to train a model with some transformations
    # such as cropping/DAE, we can modify the code here, e.g.,
    # collate_fn = partial(transform_collate, self.vocab, self.reader, Crop(0.3))
    collate_fn = allennlp_collate
    train_data_sampler = BucketBatchSampler(
        data_source=self.train_data,
        batch_size=batch_size,
    )

    # Set callbacks
    if self.config.task_id == 'SNLI' and self.config.arch != 'bert':
        epoch_callbacks = []
        if self.config.model_pretrain != "":
            epoch_callbacks = [WarmupCallback(2)]
            if self.config.model_pretrain == 'auto':
                self.config.model_pretrain = {
                    "biboe": "SNLI-fix-biboe-sum",
                    "datt": "SNLI-fix-datt"
                }[self.config.arch]
            logger.warning(
                f"Try loading weights from pretrained model {self.config.model_pretrain}"
            )
            pretrain_ckpter = CheckpointerX(f"saved/models/{self.config.model_pretrain}")
            self.model.load_state_dict(pretrain_ckpter.best_model_state())
    else:
        epoch_callbacks = []
    # epoch_callbacks = []
    batch_callbacks = []

    opt = self.model.get_optimizer()
    if self.config.arch == 'bert':
        scl = SlantedTriangular(opt, num_epochs, len(self.train_data) // batch_size)
    else:
        scl = None
    trainer = AdvTrainer(
        model=self.model,
        optimizer=opt,
        learning_rate_scheduler=scl,
        validation_metric='+accuracy',
        adv_policy=adv_policy,
        data_loader=DataLoader(
            self.train_data,
            batch_sampler=train_data_sampler,
            collate_fn=collate_fn,
        ),
        validation_data_loader=DataLoader(
            self.dev_data,
            batch_size=batch_size,
        ),
        num_epochs=num_epochs,
        patience=None,
        grad_clipping=1.,
        cuda_device=0,
        epoch_callbacks=epoch_callbacks,
        batch_callbacks=batch_callbacks,
        serialization_dir=f'saved/models/{self.config.model_name}',
        num_serialized_models_to_keep=20)
    trainer.train()