def test_from_params_in_trainer(self):
    """Integration test: `Trainer.from_params` wires num_epochs / num_steps_per_epoch
    into a `slanted_triangular` learning rate scheduler correctly."""
    # This is more of an integration test, making sure that a bunch of pieces fit together
    # correctly, but it matters most for this learning rate scheduler, so we're testing it here.
    params = Params(
        {
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        }
    )
    # The method called in the logic below only checks the length of this list, not its
    # contents, so this should be safe.
    instances = AllennlpDataset([1] * 40)
    optim = self._get_optimizer()
    trainer = Trainer.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        data_loader=DataLoader(instances, batch_size=10),
    )
    assert isinstance(trainer._learning_rate_scheduler, SlantedTriangular)

    # This is what we wrote this test for: to be sure that num_epochs is passed correctly, and
    # that num_steps_per_epoch is computed and passed correctly.  This logic happens inside of
    # `Trainer.from_partial_objects`.
    assert trainer._learning_rate_scheduler.num_epochs == 5
    # 40 instances at batch_size 10 -> 4 steps per epoch.
    assert trainer._learning_rate_scheduler.num_steps_per_epoch == 4

    # And we'll do one more to make sure that we can override num_epochs in the scheduler if we
    # really want to.  Not sure why you would ever want to in this case; this is just testing
    # the functionality.
    params = Params(
        {
            "num_epochs": 5,
            "learning_rate_scheduler": {
                "type": "slanted_triangular",
                "num_epochs": 3,
                "gradual_unfreezing": True,
                "discriminative_fine_tuning": True,
                "decay_factor": 0.5,
            },
        }
    )
    trainer = Trainer.from_params(
        model=self.model,
        optimizer=Lazy(lambda **kwargs: optim),
        serialization_dir=self.TEST_DIR,
        params=params,
        data_loader=DataLoader(instances, batch_size=10),
    )
    # The scheduler-local num_epochs wins over the trainer-level one.
    assert trainer._learning_rate_scheduler.num_epochs == 3
def run_config(config):
    """Parse a JSON experiment config, optionally train a model, and return the pieces.

    Parameters
    ----------
    config : str
        JSON string.  Must contain `dataset_reader` and `train_data_path`
        sections; may also contain `validation_data_path`, `model`,
        `data_loader`, and `trainer` sections.

    Returns
    -------
    dict
        The duplicated params, dataset reader, vocabulary, the (always-None)
        `iterator` slot kept for backward compatibility, and the trained model
        (None in dataset-preview mode).

    Raises
    ------
    RuntimeError
        If a required config section is missing.
    """
    params = Params(json.loads(config))
    # Keep an untouched copy, since `params.pop` below consumes sections.
    params_copy = params.duplicate()

    if 'dataset_reader' in params:
        reader = DatasetReader.from_params(params.pop('dataset_reader'))
    else:
        raise RuntimeError('`dataset_reader` section is required')

    all_instances = []
    if 'train_data_path' in params:
        print('Reading the training data...')
        train_data = reader.read(params.pop('train_data_path'))
        all_instances.extend(train_data)
    else:
        raise RuntimeError('`train_data_path` section is required')

    validation_data = None
    if 'validation_data_path' in params:
        print('Reading the validation data...')
        validation_data = reader.read(params.pop('validation_data_path'))
        all_instances.extend(validation_data)

    print('Building the vocabulary...')
    vocab = Vocabulary.from_instances(all_instances)

    model = None
    iterator = None  # retained in the return value for backward compatibility
    if 'model' not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print('Showing the first 10 instances:')
        for inst in all_instances[:10]:
            print(inst)
    else:
        model = Model.from_params(vocab=vocab, params=params.pop('model'))

        # BUG FIX: `DataLoader.from_params` consumes the Params it is given,
        # so each loader needs its own copy of the config.
        loader_params = params.pop("data_loader")
        train_data_loader = DataLoader.from_params(dataset=train_data,
                                                   params=deepcopy(loader_params))
        # BUG FIX: only build a validation loader when validation data exists.
        dev_data_loader = None
        if validation_data is not None:
            dev_data_loader = DataLoader.from_params(dataset=validation_data,
                                                     params=deepcopy(loader_params))

        # BUG FIX: the original called `iterator.index_with(vocab)` while
        # `iterator` was still None (guaranteed AttributeError).  It is the
        # datasets that need to be indexed against the vocabulary before
        # training; guard with hasattr in case a plain list is returned.
        if hasattr(train_data, "index_with"):
            train_data.index_with(vocab)
        if validation_data is not None and hasattr(validation_data, "index_with"):
            validation_data.index_with(vocab)

        # set up a temporary, empty directory for serialization
        with tempfile.TemporaryDirectory() as serialization_dir:
            trainer = Trainer.from_params(
                model=model,
                serialization_dir=serialization_dir,
                data_loader=train_data_loader,
                validation_data_loader=dev_data_loader,
                params=params.pop('trainer'))
            trainer.train()

    return {
        'params': params_copy,
        'dataset_reader': reader,
        'vocab': vocab,
        'iterator': iterator,
        'model': model
    }
def create_trainer_for_finding_lr(
    pipeline: Pipeline,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Build the AllenNLP trainer used by the learning-rate scan.

    Parameters
    ----------
    pipeline
        The pipeline with the model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    # Index the data against the pipeline's vocabulary when the dataset supports it.
    if hasattr(training_data, "index_with"):
        training_data.index_with(pipeline.backbone.vocab)

    data_loader = create_dataloader(
        training_data, trainer_config.batch_size, trainer_config.data_bucketing
    )
    allennlp_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer())
    )

    return Trainer.from_params(
        model=pipeline._model,
        data_loader=data_loader,
        params=allennlp_params,
        serialization_dir=None,
    )
def test_reduce_on_plateau_and_metric_agree(self):
    # pylint: disable=protected-access
    # A "+"-prefixed metric should put the plateau scheduler in "max" mode,
    # a "-"-prefixed one in "min" mode.
    for validation_metric in ["+acc", "-loss"]:
        trainer_params = Params({
            "validation_metric": validation_metric,
            "learning_rate_scheduler": {"type": "reduce_on_plateau"},
            "optimizer": {"type": "adam", "lr": 0.01},
        })
        trainer = Trainer.from_params(model=self.model,
                                      serialization_dir=self.TEST_DIR,
                                      iterator=self.iterator,
                                      train_data=self.instances,
                                      validation_data=self.instances,
                                      params=trainer_params)
        expected_mode = "max" if validation_metric[0] == "+" else "min"
        assert trainer._learning_rate_scheduler.lr_scheduler.mode == expected_mode
def create_trainer_for_finding_lr(
    model: PipelineModel,
    trainer_config: TrainerConfiguration,
    training_data: InstancesDataset,
) -> GradientDescentTrainer:
    """Create the AllenNLP trainer that drives the learning-rate scan.

    Parameters
    ----------
    model
        The underlying model
    trainer_config
        A trainer configuration
    training_data
        The training data
    """
    prepare_environment(Params({}))

    data_loader = create_dataloader(
        training_data, trainer_config.batch_size, trainer_config.data_bucketing
    )
    allennlp_params = Params(
        helpers.sanitize_for_params(trainer_config.to_allennlp_trainer())
    )

    trainer = Trainer.from_params(
        model=model,
        data_loader=data_loader,
        params=allennlp_params,
        serialization_dir=None,
    )
    # `Trainer.from_params` is typed as returning the base class; narrow for callers.
    return cast("GradientDescentTrainer", trainer)
def get_trainer_from_config(
        config: Params,
        train_instances: List[Instance],
        val_instances: List[Instance],
        device: int,
        serialization_dir: Optional[str] = None) -> Trainer:
    """Assemble vocabulary, model, iterator, and Trainer from an experiment config."""
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device
    # Keep only the single best checkpoint on disk.
    trainer_params["num_serialized_models_to_keep"] = 1

    # Use a pre-built vocabulary when a directory is given, otherwise derive
    # one from the training instances.
    vocab_dir = config.pop("vocab_dir", None)
    if vocab_dir is None:
        vocab = Vocabulary.from_instances(train_instances)
    else:
        vocab = Vocabulary.from_files(vocab_dir)

    model = Model.from_params(config.pop("model"), vocab=vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(vocab)

    return Trainer.from_params(model=model,
                               iterator=iterator,
                               train_data=train_instances,
                               validation_data=val_instances,
                               serialization_dir=serialization_dir,
                               params=trainer_params)
def setup_method(self):
    """Build a small end-to-end simple-tagger setup (config, vocab, model,
    data loader, trainer) for the learning-rate-search tests."""
    super().setup_method()
    # Self-contained experiment config over the sequence-tagging fixture data.
    params = Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {
                    "tokens": {
                        "type": "embedding",
                        "embedding_dim": 5
                    }
                }
            },
            "encoder": {
                "type": "lstm",
                "input_size": 5,
                "hidden_size": 7,
                "num_layers": 2
            },
        },
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
        "validation_data_path": str(self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"),
        "data_loader": {"batch_size": 2},
        "trainer": {"cuda_device": -1, "num_epochs": 2, "optimizer": "adam"},
    })
    all_data_loaders = data_loaders_from_params(params)
    # Build the vocabulary from every split's instances (lazily iterated).
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        instances=(instance
                   for data_loader in all_data_loaders.values()
                   for instance in data_loader.iter_instances()),
    )
    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = all_data_loaders["train"]
    data_loader.index_with(vocab)
    trainer_params = params.pop("trainer")
    serialization_dir = os.path.join(self.TEST_DIR, "test_search_learning_rate")

    self.trainer = Trainer.from_params(
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
        validation_data=None,
        validation_iterator=None,
    )
def main(args):
    """Train a model from ``args.config_path`` into ``args.output_dir`` and,
    when ``test_data_path`` is configured, evaluate the best checkpoint on it.

    Side effects: writes the vocabulary, a copy of the config, serialized
    models, and (optionally) ``test_metrics.txt`` under ``args.output_dir``.
    """
    params = Params.from_file(args.config_path)
    stdout_handler = prepare_global_logging(args.output_dir, False)
    prepare_environment(params)

    reader = DatasetReader.from_params(params["dataset_reader"])
    # NOTE(review): the `None` default means a missing train/validation path
    # reaches `reader.read(None)` — presumably these keys are required in
    # practice; confirm against the configs used with this script.
    train_dataset = reader.read(params.pop("train_data_path", None))
    valid_dataset = reader.read(params.pop("validation_data_path", None))
    test_data_path = params.pop("test_data_path", None)
    if test_data_path:
        test_dataset = reader.read(test_data_path)
        # Include test instances in the vocabulary so evaluation sees no OOV surprises.
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset + test_dataset)
    else:
        test_dataset = None
        vocab = Vocabulary.from_instances(train_dataset + valid_dataset)

    model_params = params.pop("model", None)
    model = Model.from_params(model_params.duplicate(), vocab=vocab)
    vocab.save_to_files(os.path.join(args.output_dir, "vocabulary"))

    # copy config file
    with open(args.config_path, "r", encoding="utf-8") as f_in:
        with open(os.path.join(args.output_dir, "config.json"), "w", encoding="utf-8") as f_out:
            f_out.write(f_in.read())

    iterator = DataIterator.from_params(params.pop("iterator", None))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer", None)
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=args.output_dir,
                                  iterator=iterator,
                                  train_data=train_dataset,
                                  validation_data=valid_dataset,
                                  params=trainer_params.duplicate())
    trainer.train()

    # evaluate on the test set
    if test_dataset:
        logging.info("Evaluating on the test set")
        import torch  # import here to ensure the republication of the experiment
        # Reload the best checkpoint written by the trainer before evaluating.
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "best.th")))
        test_metrics = evaluate(model,
                                test_dataset,
                                iterator,
                                cuda_device=trainer_params.pop("cuda_device", 0),
                                batch_weight_key=None)
        logging.info(f"Metrics on the test set: {test_metrics}")
        with open(os.path.join(args.output_dir, "test_metrics.txt"), "w",
                  encoding="utf-8") as f_out:
            f_out.write(f"Metrics on the test set: {test_metrics}")

    cleanup_global_logging(stdout_handler)
def setUp(self):
    """Load the simple-tagger regularization fixture and build reader,
    iterator, and trainer for the tests."""
    super().setUp()
    param_file = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
    self.set_up_model(param_file, self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    params = Params.from_file(param_file)
    self.reader = DatasetReader.from_params(params['dataset_reader'])
    self.iterator = DataIterator.from_params(params['iterator'])
    # Positional args presumably map to (model, serialization_dir, iterator,
    # train_data, validation_data, trainer params) — matches keyword usage
    # elsewhere in this codebase; TODO confirm against Trainer.from_params.
    self.trainer = Trainer.from_params(self.model, self.TEST_DIR, self.iterator,
                                       self.dataset, None, params.get('trainer'))
def setUp(self):
    """Set up the tagger-with-regularization fixture plus its reader, iterator, and trainer."""
    super().setUp()
    fixture_config = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
    tagging_data = self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv"
    self.set_up_model(fixture_config, tagging_data)

    params = Params.from_file(fixture_config)
    self.reader = DatasetReader.from_params(params["dataset_reader"])
    self.iterator = DataIterator.from_params(params["iterator"])
    self.trainer = Trainer.from_params(
        self.model,
        self.TEST_DIR,
        self.iterator,
        self.dataset,
        None,
        params.get("trainer"),
    )
def _setup(self):
    """Setup the trainer components and local resources.

    Seeds RNGs, prepares the output directory (config + vocabulary on disk),
    indexes the datasets, builds the data loaders, and constructs the
    AllenNLP trainer into ``self._trainer``.
    """
    # Seed python/numpy/torch RNGs only when a seed was configured.
    prepare_environment(
        Params({} if self._trainer_config.random_seed is None else {
            "random_seed": self._trainer_config.random_seed,
            "numpy_seed": self._trainer_config.random_seed,
            "pytorch_seed": self._trainer_config.random_seed,
        }))
    os.makedirs(self._output_dir, exist_ok=True)

    # We don't need to load pretrained weights from saved models
    if self._pipeline.config.features.word:
        self._pipeline.config.features.word.weights_file = None

    # Persist the full training configuration next to the results.
    serialization_params = sanitize(self._allennlp_configuration())
    with open(os.path.join(self._output_dir, CONFIG_NAME), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    self._pipeline.save_vocabulary(
        os.path.join(self._output_dir, "vocabulary"))

    # Index every available split against the pipeline's vocabulary
    # (splits without `index_with` are skipped).
    for dataset in [self._training, self._validation, self._test]:
        if dataset and hasattr(dataset, "index_with"):
            dataset.index_with(self._pipeline.backbone.vocab)

    trainer_params = Params(
        helpers.sanitize_for_params(
            self._trainer_config.to_allennlp_trainer()))

    pipeline_model = self._pipeline._model

    training_data_loader = create_dataloader(
        self._training,
        self._trainer_config.batch_size,
        self._trainer_config.data_bucketing,
        self._trainer_config.batches_per_epoch,
    )
    # Validation loader only when validation data was provided.
    validation_data_loader = (create_dataloader(
        self._validation,
        self._trainer_config.batch_size,
        self._trainer_config.data_bucketing,
    ) if self._validation else None)

    self._trainer = Trainer.from_params(
        model=pipeline_model,
        serialization_dir=self._output_dir,
        data_loader=training_data_loader,
        validation_data_loader=validation_data_loader,
        params=trainer_params,
        epoch_callbacks=self._epoch_callbacks,
    )
def run_config(config):
    """Parse a JSON experiment config, optionally train a model, and return the pieces.

    Parameters
    ----------
    config : str
        JSON string.  Must contain `dataset_reader`, `data_loader`,
        `train_data_path`, and `validation_data_path` sections; a `model`
        section (plus `trainer`) switches from preview mode to training.

    Returns
    -------
    dict or None
        None in dataset-preview mode; otherwise the duplicated params,
        dataset reader, vocabulary, and trained model.

    Raises
    ------
    RuntimeError
        If the `dataset_reader` section is missing.
    """
    from itertools import islice  # local import: only needed for the preview path

    params = Params(json.loads(config))
    # Keep an untouched copy, since `params.pop` below consumes sections.
    params_copy = params.duplicate()

    if "dataset_reader" in params:
        reader = DatasetReader.from_params(params.pop("dataset_reader"))
    else:
        raise RuntimeError("`dataset_reader` section is required")

    # `from_params` consumes the Params it is given, hence the duplicate for
    # the first loader.
    loader_params = params.pop("data_loader")
    train_data_loader = DataLoader.from_params(
        reader=reader,
        data_path=params.pop("train_data_path"),
        params=loader_params.duplicate(),
    )
    dev_data_loader = DataLoader.from_params(
        reader=reader,
        data_path=params.pop("validation_data_path"),
        params=loader_params,
    )

    print("Building the vocabulary...")
    vocab = Vocabulary.from_instances(train_data_loader.iter_instances())

    if "model" not in params:
        # 'dataset' mode — just preview the (first 10) instances
        print("Showing the first 10 instances:")
        # BUG FIX: the original iterated over *all* instances despite the
        # "first 10" message; limit the preview to ten.
        for inst in islice(train_data_loader.iter_instances(), 10):
            print(inst)
        return None

    model = Model.from_params(vocab=vocab, params=params.pop("model"))

    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    # set up a temporary, empty directory for serialization
    with tempfile.TemporaryDirectory() as serialization_dir:
        trainer = Trainer.from_params(
            model=model,
            serialization_dir=serialization_dir,
            data_loader=train_data_loader,
            validation_data_loader=dev_data_loader,
            params=params.pop("trainer"),
        )
        trainer.train()

    return {
        "params": params_copy,
        "dataset_reader": reader,
        "vocab": vocab,
        "model": model,
    }
def setUp(self):
    """Load the regularization experiment fixture and build its DataLoader and Trainer."""
    super().setUp()
    experiment_file = self.FIXTURES_ROOT / "simple_tagger" / "experiment_with_regularization.json"
    self.set_up_model(experiment_file, self.FIXTURES_ROOT / "data" / "sequence_tagging.tsv")

    experiment_params = Params.from_file(experiment_file)
    self.reader = DatasetReader.from_params(experiment_params["dataset_reader"])
    self.data_loader = DataLoader.from_params(
        dataset=self.instances,
        params=experiment_params["data_loader"],
    )
    self.trainer = Trainer.from_params(
        model=self.model,
        data_loader=self.data_loader,
        serialization_dir=self.TEST_DIR,
        params=experiment_params.get("trainer"),
    )
def setUp(self):
    """Build the simple-tagger regularization fixture and its Trainer."""
    super().setUp()
    config_path = self.FIXTURES_ROOT / 'simple_tagger' / 'experiment_with_regularization.json'
    data_path = self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'
    self.set_up_model(config_path, data_path)

    config = Params.from_file(config_path)
    self.reader = DatasetReader.from_params(config['dataset_reader'])
    self.iterator = DataIterator.from_params(config['iterator'])
    self.trainer = Trainer.from_params(self.model, self.TEST_DIR, self.iterator,
                                       self.dataset, None, config.get('trainer'))
def get_trainer_from_config(config: Params,
                            train_instances: List[Instance],
                            val_instances: List[Instance],
                            vocab: Optional[Vocabulary] = None,
                            device: Optional[int] = -1) -> Trainer:
    """Build an AllenNLP Trainer from an experiment config, deriving the
    vocabulary from the training instances unless one is supplied."""
    trainer_params = config.pop("trainer")
    trainer_params["cuda_device"] = device

    vocab = vocab or Vocabulary.from_instances(train_instances)
    model = Model.from_params(config.pop("model"), vocab=vocab)

    iterator = DataIterator.from_params(config.pop("iterator"))
    iterator.index_with(vocab)

    return Trainer.from_params(
        model=model,
        iterator=iterator,
        train_data=train_instances,
        validation_data=val_instances,
        serialization_dir=None,
        params=trainer_params)
def test_mode_specified_in_reduce_on_plateau(self):
    # pylint: disable=protected-access
    # An explicitly configured mode must be honored regardless of the metric's sign.
    for scheduler_mode, validation_metric in [("min", "-custom"), ("max", "+custom")]:
        trainer_params = Params({
            "validation_metric": validation_metric,
            "learning_rate_scheduler": {"type": "reduce_on_plateau", "mode": scheduler_mode},
            "optimizer": {"type": "adam", "lr": 0.01},
        })
        trainer = Trainer.from_params(model=self.model,
                                      serialization_dir=self.TEST_DIR,
                                      iterator=self.iterator,
                                      train_data=self.instances,
                                      validation_data=self.instances,
                                      params=trainer_params)
        assert trainer._learning_rate_scheduler.lr_scheduler.mode == scheduler_mode
def test_mode_doesnt_agree_with_metric(self):
    # pylint: disable=protected-access
    # Deliberately mismatched mode/metric pairs: the configured mode still wins.
    for scheduler_mode, validation_metric in [("max", "-custom"), ("min", "+custom")]:
        trainer_params = Params({
            "validation_metric": validation_metric,
            "learning_rate_scheduler": {"type": "reduce_on_plateau", "mode": scheduler_mode},
            "optimizer": {"type": "adam", "lr": 0.01},
        })
        # we warn when the metric and the mode don't agree
        with self.assertLogs(logger="allennlp.training.util", level="WARNING"):
            trainer = Trainer.from_params(model=self.model,
                                          serialization_dir=self.TEST_DIR,
                                          iterator=self.iterator,
                                          train_data=self.instances,
                                          validation_data=self.instances,
                                          params=trainer_params)
        assert trainer._learning_rate_scheduler.lr_scheduler.mode == scheduler_mode
def train_model(data_path, params, serialization_dir, cuda_device=-1, use_validation_data=True):
    """Train the CMV GAN pipeline: predictor pretraining, optional actor-critic
    pretraining, then alternating discriminator/generator training.

    Parameters
    ----------
    data_path
        Unused in the visible body — NOTE(review): confirm whether callers rely on it.
    params
        Experiment Params; sections are consumed via ``pop`` as training proceeds.
    serialization_dir
        Directory for logs, vocabulary, checkpoints, and archives.
    cuda_device
        Device id forwarded into the trainer params (``-1`` = CPU).
    use_validation_data
        Whether to read and use the 'val' split.

    Returns
    -------
    The generator (``CMVGeneratorTrainer``) after training.
    """
    os.makedirs(serialization_dir, exist_ok=True)
    # Mirror python logging into a file inside the serialization directory.
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Snapshot the full config before any section is popped.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    ds_params = params.pop('dataset_reader', {})
    data_params = ds_params.pop('data', {})
    dataset_reader = CMVReader.from_params(ds_params)

    logger.info('Reading training data...')
    train_data = dataset_reader.read('train', **data_params)

    #train_data_response_only_for_vocab = dataset_reader.read('train', response_only=True)
    #all_datasets = [train_data_response_only_for_vocab]
    all_datasets = [train_data]
    datasets_in_vocab = ['train']  #_response_only_for_vocab']

    if use_validation_data:
        logger.info('Reading validation data...')
        data_params['weakpoints_only'] = False
        validation_data = dataset_reader.read('val', **data_params)
        all_datasets.append(validation_data)
        datasets_in_vocab.append('val')
    else:
        validation_data = None

    logger.info('Creating a vocabulary using %s data.',
                ', '.join(datasets_in_vocab))
    vocab_params = params.pop('vocabulary', {})
    dataset = None
    # Only materialize a Batch of all instances when the vocab is not loaded
    # from a pre-built directory.
    if 'directory_path' not in vocab_params:
        dataset = Batch([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ])
    vocab = Vocabulary.from_params(vocab_params, dataset)
    vocab.save_to_files(os.path.join(serialization_dir, 'vocabulary'))

    iterator = DataIterator.from_params(params.pop('iterator'))

    cmv_predictor_params = params.pop('cmv_predictor')
    predictor_pretrained_params = cmv_predictor_params.pop(
        'predictor_pretrained_params', None)
    cmv_predictor = Model.from_params(params=cmv_predictor_params, vocab=vocab)

    # NOTE(review): `predictor_pretrained_params` defaults to None above but is
    # indexed unconditionally here — a config without that section would crash.
    # Presumably all real configs provide it; confirm.
    model_state = torch.load(predictor_pretrained_params['filename'],
                             map_location=util.device_mapping(
                                 predictor_pretrained_params['cuda_device']))
    cmv_predictor.load_state_dict(model_state)

    if params.pop('shared_embedder', False):
        print('using shared embedder')
        # Reuse the predictor's response-side modules as the document embedder.
        document_embedder = HierarchicalDocumentEmbedder(
            vocab, cmv_predictor._response_embedder,
            cmv_predictor._response_word_attention,
            cmv_predictor._response_encoder)
    else:
        document_embedder = Model.from_params(
            params=params.pop('document_embedder'), vocab=vocab)

    cmv_extractor = Model.from_params(params=params.pop('cmv_extractor'))
    cmv_discriminator = Model.from_params(
        params=params.pop('cmv_discriminator'))

    cmv_actor_critic_params = params.pop('cmv_actor_critic', None)
    cmv_actor_critic = None
    if cmv_actor_critic_params is not None:
        cmv_actor_critic = Model.from_params(params=cmv_actor_critic_params)

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer", None)
    if trainer_params is not None:
        if cuda_device is not None:
            trainer_params["cuda_device"] = cuda_device
        trainer = Trainer.from_params(cmv_predictor, serialization_dir,
                                      iterator, train_data, validation_data,
                                      trainer_params)

    compress_response = params.pop('compress_response', False)

    generator_iterator = DataIterator.from_params(
        params.pop('generator_iterator'))

    cmv_actor_critic_trainer_params = params.pop('actor_critic_trainer', None)
    if cmv_actor_critic_trainer_params is not None:
        cmv_actor_critic_pretrainer = CMVActorCriticTrainer(
            document_embedder, cmv_predictor, cmv_extractor, cmv_actor_critic,
            cmv_actor_critic_trainer_params.pop('train_predictor', False),
            cmv_actor_critic_trainer_params.pop('train_fake_predictor', False),
            compress_response)
        cmv_actor_critic_serialization_dir = os.path.join(
            serialization_dir, 'actor_critic')
        cmv_actor_critic_trainer = Trainer.from_params(
            cmv_actor_critic_pretrainer, cmv_actor_critic_serialization_dir,
            generator_iterator, train_data,
            validation_data, cmv_actor_critic_trainer_params)
    else:
        # No actor-critic training configured: optionally warm-start the
        # embedder/predictor/extractor from a pretrained actor-critic bundle.
        ac_pretrained_params = params.pop('pretrained_actor_critic', None)
        if ac_pretrained_params is not None:
            cmv_actor_critic_pretrainer = CMVActorCriticTrainer(
                document_embedder, cmv_predictor, cmv_extractor, None)
            model_state = torch.load(ac_pretrained_params['filename'],
                                     map_location=util.device_mapping(
                                         ac_pretrained_params['cuda_device']))
            cmv_actor_critic_pretrainer.load_state_dict(model_state)
            document_embedder = cmv_actor_critic_pretrainer._document_embedder
            cmv_predictor = cmv_actor_critic_pretrainer._cmv_predictor
            cmv_extractor = cmv_actor_critic_pretrainer._cmv_extractor

    generator = CMVGeneratorTrainer(
        document_embedder,
        cmv_predictor,
        cmv_extractor,
        cmv_discriminator,
        cmv_actor_critic,
        update_extractor=True,  #cmv_actor_critic_trainer_params is None,
        update_gold_extractor=False,  #True,
        compress_response=compress_response)  #False)

    discriminator = CMVDiscriminatorTrainer(document_embedder, cmv_predictor,
                                            cmv_extractor, cmv_discriminator,
                                            compress_response)

    generator_serialization_dir = os.path.join(serialization_dir, 'generator')
    os.makedirs(generator_serialization_dir, exist_ok=True)
    generator_trainer = GANTrainer.from_params(generator,
                                               generator_serialization_dir,
                                               generator_iterator, train_data,
                                               validation_data,
                                               params.pop('generator_trainer'))

    discriminator_serialization_dir = os.path.join(serialization_dir,
                                                   'discriminator')
    os.makedirs(discriminator_serialization_dir, exist_ok=True)
    discriminator_trainer = GANTrainer.from_params(
        discriminator, discriminator_serialization_dir, iterator, train_data,
        validation_data, params.pop('discriminator_trainer'))

    #first train predictor for N steps
    if trainer_params is not None:
        trainer._num_epochs = 5  #hacky
        trainer.train()

    #TODO? then train actor critic for M steps
    #if we are using separate predictors, use the full CMV to train the extractor based on maximizing persuasiveness prediction
    if cmv_actor_critic_trainer_params is not None:
        cmv_actor_critic_trainer.train()

    #then alternate training between discriminator and generator for E epochs
    generator_trainer._num_epochs = 1  #hacky
    discriminator_trainer._num_epochs = 1  #hacky
    gan_epochs = params.pop("gan_epochs")
    # Each round bumps _num_epochs so the trainers run one additional epoch
    # per call rather than restarting from scratch.
    for i in range(gan_epochs):
        discriminator_trainer.train()
        generator_trainer.train()
        discriminator_trainer._num_epochs += 1  #very hacky
        generator_trainer._num_epochs += 1  #also hacky

    #if cmv_actor_critic_trainer_params is not None:
    #    cmv_actor_critic_trainer._num_epochs += 1
    #    cmv_actor_critic_trainer.train()

    # Now tar up results
    archive_model(serialization_dir)
    archive_model(generator_serialization_dir)
    archive_model(discriminator_serialization_dir)

    return generator
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    # Tee stdout/stderr into log files alongside the results.
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Snapshot the config before sections are popped below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def find_learning_rate_model(params: Params, serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir``

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate upto which search is done.
    num_batches: ``int``
        Number of mini-batches to run Learning rate finder.
    linear_steps: ``bool``
        Increase learning rate linearly if False exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        multiple of stopping factor. If ``None`` search proceeds till the ``end_lr``
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    else:
        os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        (instance for key, dataset in all_datasets.items()
         for instance in dataset
         if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    # Freeze any parameters matched by the `no_grad` regexes before the scan.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    # The LR finder drives the default Trainer's internals directly, so other
    # trainer types are rejected up front.
    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")
    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, losses = search_learning_rate(trainer,
                                                  start_lr=start_lr,
                                                  end_lr=end_lr,
                                                  num_batches=num_batches,
                                                  linear_steps=linear_steps,
                                                  stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')
    # Exponentially smooth the loss curve before plotting.
    losses = _smooth(losses, 0.98)

    _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
def find_learning_rate_model(params: Params, serialization_dir: str, start_lr: float = 1e-5, end_lr: float = 10, num_batches: int = 100, linear_steps: bool = False, stopping_factor: float = None, force: bool = False) -> None: """ Runs learning rate search for given `num_batches` and saves the results in ``serialization_dir`` Parameters ---------- trainer: :class:`~allennlp.common.registrable.Registrable` params : ``Params`` A parameter object specifying an AllenNLP Experiment. serialization_dir : ``str`` The directory in which to save results. start_lr: ``float`` Learning rate to start the search. end_lr: ``float`` Learning rate upto which search is done. num_batches: ``int`` Number of mini-batches to run Learning rate finder. linear_steps: ``bool`` Increase learning rate linearly if False exponentially. stopping_factor: ``float`` Stop the search when the current loss exceeds the best loss recorded by multiple of stopping factor. If ``None`` search proceeds till the ``end_lr`` force: ``bool`` If True and the serialization directory already exists, everything in it will be removed prior to finding the learning rate. 
""" if os.path.exists(serialization_dir) and force: shutil.rmtree(serialization_dir) if os.path.exists(serialization_dir) and os.listdir(serialization_dir): raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is ' f'not empty.') else: os.makedirs(serialization_dir, exist_ok=True) prepare_environment(params) cuda_device = params.params.get('trainer').get('cuda_device', -1) if isinstance(cuda_device, list): for device in cuda_device: check_for_gpu(device) else: check_for_gpu(cuda_device) all_datasets = datasets_from_params(params) datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets)) for dataset in datasets_for_vocab_creation: if dataset not in all_datasets: raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}") logger.info("From dataset instances, %s will be considered for vocabulary creation.", ", ".join(datasets_for_vocab_creation)) vocab = Vocabulary.from_params( params.pop("vocabulary", {}), (instance for key, dataset in all_datasets.items() for instance in dataset if key in datasets_for_vocab_creation) ) model = Model.from_params(vocab=vocab, params=params.pop('model')) iterator = DataIterator.from_params(params.pop("iterator")) iterator.index_with(vocab) train_data = all_datasets['train'] trainer_params = params.pop("trainer") no_grad_regexes = trainer_params.pop("no_grad", ()) for name, parameter in model.named_parameters(): if any(re.search(regex, name) for regex in no_grad_regexes): parameter.requires_grad_(False) trainer = Trainer.from_params(model, serialization_dir, iterator, train_data, params=trainer_params, validation_data=None, validation_iterator=None) logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.') learning_rates, losses = search_learning_rate(trainer, start_lr=start_lr, end_lr=end_lr, num_batches=num_batches, linear_steps=linear_steps, stopping_factor=stopping_factor) logger.info(f'Finished learning 
rate search.') losses = _smooth(losses, 0.98) _save_plot(learning_rates, losses, os.path.join(serialization_dir, 'lr-losses.png'))
# BUG FIX: `list.extend` returns None, so the original code handed
# `instances=None` to the vocabulary constructors.  Build the combined
# instance list explicitly instead.
all_instances = train_instances + valid_instances
if 'vocabulary' in params:
    vocab_params = params['vocabulary']
    vocab = Vocabulary.from_params(params=vocab_params,
                                   instances=all_instances)
else:
    vocab = Vocabulary.from_instances(all_instances)

dep_model = Model.from_params(vocab=vocab, params=params['model'])
print(dep_model)

iterator = DataIterator.from_params(params.pop("iterator"))
iterator.index_with(vocab)

# Index both splits against the vocabulary before training.
train_dataset = Batch(train_instances)
train_dataset.index_instances(vocab)
valid_dataset = Batch(valid_instances)
valid_dataset.index_instances(vocab)

trainer_params = params.pop("trainer")
trainer = Trainer.from_params(model=dep_model,
                              serialization_dir='',
                              iterator=iterator,
                              train_data=train_dataset,
                              validation_data=valid_dataset,
                              params=trainer_params,
                              validation_iterator=iterator)
metrics = trainer.train()
archive_model('data/output')
def train_model(params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP directly
    from a JSON specification using a :class:`Driver`. Note that if you care about
    reproducibility, you should avoid running code using Pytorch or numpy which affect
    the reproducibility of your experiment before you import and use this function —
    these libraries rely on random seeds which are set here.

    Performs training and will also evaluate the trained model on development and test
    sets if provided in the parameter json.

    Parameters
    ----------
    params : ``Params``, required.
        A parameter object specifying an AllenNLP Experiment; consumed section by section.
    cuda_device : ``int``, required.
        Copied into the trainer config when not None.
    serialization_dir : ``str``, required.
        The directory in which to save results and logs.
    filtering : ``str``, required.
        NOTE(review): unused inside this function; kept for interface
        compatibility with callers — confirm before removing.

    Returns
    -------
    The trained ``Model``.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    # Mirror stdout/stderr into log files; older TeeLogger versions take no
    # third argument, hence the TypeError fallback.
    try:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr, True)  # type: ignore
    except TypeError:
        sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore

    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Persist the full experiment config before sections get popped off below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    read_settings = ds_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(ds_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(
        train_data_path,
        include_metadata=True,
        replace_with_gold=read_settings.pop('replace_gold', False),
        pad_with_nearest=read_settings.pop('pad_with_nearest', 0))

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path, include_metadata=True)
    else:
        validation_data = None

    vocab_params = params.pop("vocabulary", {})
    # A pre-built vocabulary is mandatory here; raise explicitly instead of
    # relying on `assert`, which is stripped when Python runs with -O.
    if 'directory_path' not in vocab_params:
        raise ValueError("the 'vocabulary' config must provide a 'directory_path'")
    vocab = Vocabulary.from_params(vocab_params, None)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Performs a learning-rate range test for `num_batches` mini-batches and writes
    the resulting loss curve into `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        Where the results (the loss plot) are written.
    start_lr : `float`
        First learning rate tried in the sweep.
    end_lr : `float`
        Largest learning rate the sweep may reach.
    num_batches : `int`
        How many mini-batches to run while sweeping.
    linear_steps : `bool`
        Sweep linearly when True, exponentially otherwise.
    stopping_factor : `float`
        Abort once the current loss exceeds the best recorded loss by this
        factor; `None` means sweep all the way to `end_lr`.
    force : `bool`
        When True, an existing serialization directory is emptied before the
        search starts.
    """
    create_serialization_dir(params, serialization_dir, recover=False, force=force)
    prepare_environment(params)

    device = params.params.get("trainer").get("cuda_device", -1)
    check_for_gpu(device)

    # See https://github.com/allenai/allennlp/issues/3658
    distributed_config = params.params.get("distributed")
    assert not distributed_config, "find-lr is not compatible with DistributedDataParallel."

    datasets = datasets_from_params(params)
    vocab_dataset_keys = set(params.pop("datasets_for_vocab_creation", datasets))
    for key in vocab_dataset_keys:
        if key not in datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {key}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(vocab_dataset_keys),
    )

    # Lazily stream instances from every dataset selected for vocab creation.
    vocab_instances = (
        instance
        for key, dataset in datasets.items()
        if key in vocab_dataset_keys
        for instance in dataset
    )
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), instances=vocab_instances)

    train_data = datasets["train"]
    train_data.index_with(vocab)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data, params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    # Freeze every model parameter whose name matches a "no_grad" regex.
    frozen_regexes = trainer_params.pop("no_grad", ())
    for param_name, parameter in model.named_parameters():
        if any(re.search(regex, param_name) for regex in frozen_regexes):
            parameter.requires_grad_(False)

    if trainer_params.pop("type", "gradient_descent") != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )
    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")

    losses = _smooth(losses, 0.98)
    _save_plot(learning_rates, losses, os.path.join(serialization_dir, "lr-losses.png"))
def train_model(data_path, params, serialization_dir, cuda_device=-1, use_validation_data=True):
    """
    Trains a CMV model from a parameter specification and archives the result.

    Parameters
    ----------
    data_path :
        NOTE(review): unused inside this function (the reader is configured from
        `params`); kept for caller compatibility — confirm before removing.
    params : ``Params``
        Experiment configuration; consumed (``pop``-ed) section by section.
    serialization_dir : ``str``
        Directory for logs, the saved config, vocabulary and model archive.
    cuda_device : ``int``, optional (default = -1)
        Copied into the trainer config when not None.
    use_validation_data : ``bool``, optional (default = True)
        Whether to read a 'val' split and validate against it during training.

    Returns
    -------
    The trained ``Model``.
    """
    os.makedirs(serialization_dir, exist_ok=True)
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Persist the full experiment config before sections get popped off below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    ds_params = params.pop('dataset_reader', {})
    data_params = ds_params.pop('data', {})
    dataset_reader = CMVReader.from_params(ds_params)

    logger.info('Reading training data...')
    train_data = dataset_reader.read('train', **data_params)

    all_datasets = [train_data]
    datasets_in_vocab = ['train']

    if use_validation_data:
        logger.info('Reading validation data...')
        validation_data = dataset_reader.read('val', **data_params)
        all_datasets.append(validation_data)
        datasets_in_vocab.append('val')
    else:
        validation_data = None

    logger.info('Creating a vocabulary using %s data.', ', '.join(datasets_in_vocab))
    vocab_params = params.pop('vocabulary', {})
    # Only gather instances for vocabulary creation when no pre-built
    # vocabulary directory was supplied in the config.
    dataset = None
    if 'directory_path' not in vocab_params:
        # Renamed the loop variable so it no longer shadows `dataset`.
        dataset = Batch([instance
                         for split in all_datasets
                         for instance in split.instances])
    vocab = Vocabulary.from_params(vocab_params, dataset)
    vocab.save_to_files(os.path.join(serialization_dir, 'vocabulary'))

    model = Model.from_params(params=params.pop('model'), vocab=vocab)
    iterator = DataIterator.from_params(params.pop('iterator'))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  validation_data,
                                  trainer_params)
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model