def check_model_computes_gradients_correctly(model: Model,
                                             model_batch: Dict[str, Union[Any, Dict[str, Any]]],
                                             params_to_ignore: Set[str] = None):
    """
    Run one forward/backward pass and check the gradient of every named parameter.

    Parameters
    ----------
    model : ``Model``
        Called as ``model(**model_batch)``; the result must contain a ``"loss"``
        entry that supports ``.backward()``.
    model_batch : ``Dict[str, Union[Any, Dict[str, Any]]]``
        Keyword arguments for the forward pass.
    params_to_ignore : ``Set[str]``, optional (default = None)
        Parameter names (as yielded by ``model.named_parameters()``) that are
        skipped entirely.

    Raises
    ------
    Exception
        If a checked parameter that requires a gradient has ``grad is None`` or a
        dense gradient that is zero everywhere.  Each offender is printed first.
    AssertionError
        If a checked parameter that does not require a gradient has one anyway.
    """
    print("Checking gradients")
    model.zero_grad()
    result = model(**model_batch)
    result["loss"].backward()

    has_zero_or_none_grads = {}
    for name, parameter in model.named_parameters():
        if params_to_ignore and name in params_to_ignore:
            continue

        if not parameter.requires_grad:
            assert parameter.grad is None
            continue

        gradient = parameter.grad
        if gradient is None:
            has_zero_or_none_grads[name] = "No gradient computed (i.e parameter.grad is None)"
        elif gradient.is_sparse or gradient.data.is_sparse:
            # Sparse gradients are accepted without inspecting their values.
            pass
        # Some parameters will only be partially updated,
        # like embeddings, so we just check that any gradient is non-zero.
        # Comparing against the scalar 0 avoids allocating a zeros tensor and
        # copying the gradient to the CPU for every parameter.
        elif (gradient == 0).all():
            has_zero_or_none_grads[name] = f"zeros with shape ({tuple(gradient.size())})"

    if has_zero_or_none_grads:
        for name, grad in has_zero_or_none_grads.items():
            print(f"Parameter: {name} had incorrect gradient: {grad}")
        raise Exception("Incorrect gradients found. See stdout for more info.")
def test_mismatching_dimensions_throws_configuration_error(self):
    """A wrong encoder ``input_size`` must make model construction fail."""
    config = Params.from_file(self.param_file)
    # Make the phrase layer wrong - it should be 150 to match
    # the embedding + binary feature dimensions.
    encoder_config = config["model"]["encoder"]
    encoder_config["input_size"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=config.pop("model"))
def test_mismatching_dimensions_throws_configuration_error(self):
    """An encoder whose ``input_size`` disagrees with the embedder is rejected."""
    experiment = Params.from_file(self.param_file)
    # Make the encoder wrong - it should be 2 to match
    # the embedding dimension from the text_field_embedder.
    model_section = experiment["model"]
    model_section["encoder"]["input_size"] = 10
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=experiment.pop("model"))
def test_mismatching_contextualizer_unidirectionality_throws_configuration_error(self):
    """Flipping the contextualizer's ``bidirectional`` flag must be rejected."""
    experiment = Params.from_file(self.param_file)
    # Make the contextualizer unidirectionality wrong - it should be
    # False to match the language model.
    flipped = not self.bidirectional
    contextualizer_config = experiment["model"]["contextualizer"]
    contextualizer_config["bidirectional"] = flipped
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=experiment.get("model"))
def test_elmo_but_no_set_flags_throws_configuration_error(self):
    """ELMo configured in the model but both usage flags off must be rejected."""
    fixture_dir = self.FIXTURES_ROOT / 'biattentive_classification_network'
    experiment = Params.from_file(fixture_dir / 'elmo_experiment.json')
    # Elmo is specified in the model, but set both flags to false.
    model_section = experiment["model"]
    for flag in ("use_input_elmo", "use_integrator_output_elmo"):
        model_section[flag] = False
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=experiment.get("model"))
def test_mismatched_dimensions_raise_configuration_errors(self):
    """Each of two independent dimension errors must raise ``ConfigurationError``."""
    # Every case starts from a freshly loaded config and breaks one value:
    #  - the input_dim to the first feedforward_layer (it should be 2);
    #  - the projection output_dim of the last layer (it should be 3, equal to
    #    the number of classes).
    broken_settings = (
        ("attend_feedforward", "input_dim"),
        ("aggregate_feedforward", "output_dim"),
    )
    for component, setting in broken_settings:
        experiment = Params.from_file(self.param_file)
        experiment["model"][component][setting] = 10
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=experiment.pop("model"))
def test_simple_tagger_constraint_type_deprecated(self):
    """Building a ``crf_tagger`` with ``constraint_type`` warns but still yields a configured model."""
    embedder_config = {
        "token_embedders": {
            "tokens": {
                "type": "embedding",
                "embedding_dim": 50
            },
        }
    }
    encoder_config = {
        "type": "gru",
        "input_size": 50,
        "hidden_size": 10,
        "num_layers": 2,
        "dropout": 0.5,
        "bidirectional": True
    }
    experiment = Params({"model": {
        "type": "crf_tagger",
        "constraint_type": "IOB1",
        "text_field_embedder": embedder_config,
        "encoder": encoder_config,
    }})

    with pytest.warns(DeprecationWarning):
        tagger = Model.from_params(vocab=self.vocab, params=experiment.pop("model"))

    assert tagger._f1_metric is not None
    assert tagger._f1_metric._label_encoding == "IOB1"
    assert tagger.label_encoding == "IOB1"
    # The mask must not sum to (num_tags + 2) ** 2 -- presumably the value an
    # unconstrained mask would have; confirm against the CRF implementation.
    mask_total = tagger.crf._constraint_mask.sum().item()
    assert mask_total != (tagger.num_tags + 2)**2
def test_batch_predictions_are_consistent(self):
    """
    Run the batch-consistency check on a copy of the model without the character CNN.

    ``self.model`` and ``self.instances`` are swapped out for the duration of the
    check and are restored afterwards, even when the check fails.
    """
    # The CNN encoder has problems with this kind of test - it's not properly masked yet, so
    # changing the amount of padding in the batch will result in small differences in the
    # output of the encoder.  Because BiDAF is so deep, these differences get magnified through
    # the network and make this test impossible.  So, we'll remove the CNN encoder entirely
    # from the model for this test.  If/when we fix the CNN encoder to work correctly with
    # masking, we can change this back to how the other models run this test, with just a
    # single line.
    # pylint: disable=protected-access,attribute-defined-outside-init

    # Save some state.
    saved_model = self.model
    saved_instances = self.instances

    try:
        # Modify the state, run the test with modified state.
        params = Params.from_file(self.param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])
        # Keep only the word-level indexer so no character input is produced.
        reader._token_indexers = {'tokens': reader._token_indexers['tokens']}
        self.instances = reader.read('tests/fixtures/data/squad.json')
        vocab = Vocabulary.from_instances(self.instances)
        for instance in self.instances:
            instance.index_fields(vocab)
        del params['model']['text_field_embedder']['token_characters']
        params['model']['phrase_layer']['input_size'] = 2
        self.model = Model.from_params(vocab, params['model'])

        self.ensure_batch_predictions_are_consistent()
    finally:
        # Restore the state, whether or not the check above passed.
        self.model = saved_model
        self.instances = saved_instances
def setUp(self):
    """Build a small ``simple_tagger`` experiment and store its trainer on ``self.trainer``."""
    super().setUp()
    tagging_data = str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv')
    model_config = {
        "type": "simple_tagger",
        "text_field_embedder": {
            "token_embedders": {
                "tokens": {
                    "type": "embedding",
                    "embedding_dim": 5
                }
            }
        },
        "encoder": {
            "type": "lstm",
            "input_size": 5,
            "hidden_size": 7,
            "num_layers": 2
        }
    }
    experiment = Params({
        "model": model_config,
        "dataset_reader": {"type": "sequence_tagging"},
        "train_data_path": tagging_data,
        "validation_data_path": str(self.FIXTURES_ROOT / 'data' / 'sequence_tagging.tsv'),
        "iterator": {"type": "basic", "batch_size": 2},
        "trainer": {
            "cuda_device": -1,
            "num_epochs": 2,
            "optimizer": "adam"
        }
    })

    datasets = datasets_from_params(experiment)
    # The vocabulary is built from the instances of every loaded dataset.
    every_instance = (example
                      for split in datasets.values()
                      for example in split)
    vocabulary = Vocabulary.from_params(experiment.pop("vocabulary", {}), every_instance)

    tagger = Model.from_params(vocab=vocabulary, params=experiment.pop('model'))
    batcher = DataIterator.from_params(experiment.pop("iterator"))
    batcher.index_with(vocabulary)

    training_split = datasets['train']
    trainer_config = experiment.pop("trainer")
    output_dir = os.path.join(self.TEST_DIR, 'test_search_learning_rate')

    self.trainer = Trainer.from_params(tagger,
                                       output_dir,
                                       batcher,
                                       training_split,
                                       params=trainer_config,
                                       validation_data=None,
                                       validation_iterator=None)
def test_forward_with_epoch_num_changes_cost_weight(self):
    """The checklist cost weight shrinks after each forward call fed by the epoch-tracking iterator."""
    # Redefining model. We do not want this to change the state of ``self.model``.
    experiment = Params.from_file(self.param_file)
    fresh_model = Model.from_params(vocab=self.vocab, params=experiment['model'])

    # Initial cost weight, before forward is called.
    assert fresh_model._checklist_cost_weight == 0.8

    epoch_iterator = EpochTrackingBucketIterator(sorting_keys=[['sentence', 'num_tokens']])
    observed_weights = []
    for batch in epoch_iterator(self.dataset, num_epochs=4):
        fresh_model.forward(**batch)
        observed_weights += [fresh_model._checklist_cost_weight]

    # The config file has ``wait_num_epochs`` set to 0, so the model starts decreasing the cost
    # weight at epoch 0 itself.
    expected_weights = [0.72, 0.648, 0.5832, 0.52488]
    assert_almost_equal(observed_weights, expected_weights)
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    """
    Build the vocabulary and the model for an experiment without training.

    The vocabulary is written to ``<serialization_dir>/vocabulary``, dataset and
    vocabulary statistics are printed, and the names of the frozen and tunable
    model parameters are logged.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration.  The ``vocabulary``, ``datasets_for_vocab_creation``,
        ``model`` and ``trainer`` entries are popped here; ``trainer.no_grad`` may hold
        regexes naming parameters to freeze.
    serialization_dir : ``str``
        Output directory; created if it does not exist.

    Raises
    ------
    ConfigurationError
        If the vocabulary directory already exists and contains files, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list, so it must be tested for emptiness:
    # comparing it with None rejected existing-but-empty directories as well.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    # Filter on the dataset key before iterating, so excluded datasets are never read.
    instances = [instance
                 for key, dataset in all_datasets.items()
                 if key in datasets_for_vocab_creation
                 for instance in dataset]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info("writing the vocabulary to %s.", vocab_dir)
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
def set_up_model(self, param_file, dataset_file):
    """Load a config and a dataset, then store vocab, instances, model and an indexed batch on ``self``."""
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    experiment = Params.from_file(self.param_file)

    dataset_reader = DatasetReader.from_params(experiment['dataset_reader'])
    loaded_instances = dataset_reader.read(dataset_file)
    self.vocab, self.instances = Vocabulary.from_instances(loaded_instances), loaded_instances
    self.model = Model.from_params(self.vocab, experiment['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    indexed_batch = Batch(self.instances)
    self.dataset = indexed_batch
    self.dataset.index_instances(self.vocab)
def main(serialization_directory, device):
    """
    Write model predictions and gold tags for the validation data to files.

    Reads ``config.json`` from ``serialization_directory``, loads the model from
    the same directory, predicts tags for every instance of the configured
    ``validation_data_path`` and writes ``predictions.txt`` and ``gold.txt`` into
    ``serialization_directory`` via ``write_to_conll_eval_file``.

    serialization_directory : str, required.
        The directory containing the serialized weights.
    device: int, required.
        The device to run the evaluation on.  This function defines no default;
        any default (e.g. -1) has to be supplied by the caller.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))
    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")

    # Load the evaluation data and index it.
    print("Reading evaluation data from {}".format(evaluation_data_path))
    instances = dataset_reader.read(evaluation_data_path)
    iterator = BasicIterator(batch_size=32)
    iterator.index_with(model.vocab)

    model_predictions = []
    batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False)
    for batch in Tqdm.tqdm(batches):
        result = model(**batch)
        predictions = model.decode(result)
        model_predictions.extend(predictions["tags"])

    # The output files are opened only once all predictions exist, and the
    # context manager closes both even if writing an instance fails.
    with open(prediction_file_path, "w+") as prediction_file, \
            open(gold_file_path, "w+") as gold_file:
        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = fields["tokens"].tokens

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
def set_up_model(self, param_file, dataset_file):
    """Load a config and a dataset, then store vocab, instances, model and an indexed batch on ``self``."""
    # pylint: disable=attribute-defined-outside-init
    self.param_file = param_file
    experiment = Params.from_file(self.param_file)

    dataset_reader = DatasetReader.from_params(experiment['dataset_reader'])
    loaded_instances = dataset_reader.read(dataset_file)

    # Use parameters for vocabulary if they are present in the config file, so that choices like
    # "non_padded_namespaces", "min_count" etc. can be set if needed.
    if 'vocabulary' not in experiment:
        vocabulary = Vocabulary.from_instances(loaded_instances)
    else:
        vocabulary = Vocabulary.from_params(params=experiment['vocabulary'],
                                            instances=loaded_instances)

    self.vocab = vocabulary
    self.instances = loaded_instances
    self.model = Model.from_params(vocab=self.vocab, params=experiment['model'])

    # TODO(joelgrus) get rid of these
    # (a lot of the model tests use them, so they'll have to be changed)
    indexed_batch = Batch(self.instances)
    self.dataset = indexed_batch
    self.dataset.index_instances(self.vocab)
def test_mismatching_dimensions_throws_configuration_error(self):
    """Each encoder with a wrong ``input_size`` must make model construction fail."""
    params = Params.from_file(self.param_file)
    # Make the phrase layer wrong - it should be 10 to match
    # the embedding + char cnn dimensions.
    params["model"]["phrase_layer"]["input_size"] = 12
    with pytest.raises(ConfigurationError):
        Model.from_params(self.vocab, params.pop("model"))

    params = Params.from_file(self.param_file)
    # Make the modeling layer input_dimension wrong - it should be 40 to match
    # 4 * output_dim of the phrase_layer.
    # This case previously mutated ``phrase_layer`` again, which only repeated
    # the first case and left the modeling layer untested.
    params["model"]["modeling_layer"]["input_size"] = 30
    with pytest.raises(ConfigurationError):
        Model.from_params(self.vocab, params.pop("model"))

    params = Params.from_file(self.param_file)
    # Make the span end encoder input_dimension wrong - it should be 70 to match
    # 4 * phrase_layer.output_dim + 3 * modeling_layer.output_dim.
    params["model"]["span_end_encoder"]["input_size"] = 50
    with pytest.raises(ConfigurationError):
        Model.from_params(self.vocab, params.pop("model"))
def test_elmo_num_repr_set_flags_mismatch_throws_configuration_error(self):
    """The ELMo usage flags must agree with ``num_output_representations``."""
    # pylint: disable=line-too-long
    base_config = Params.from_file(self.FIXTURES_ROOT / 'biattentive_classification_network' / 'elmo_experiment.json')

    def expect_rejection(candidate):
        # Building the model from this config has to raise ConfigurationError.
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=candidate.get("model"))

    # Elmo is specified in the model, with num_output_representations=2. Set
    # only one flag to true.
    input_off = deepcopy(base_config)
    input_off["model"]["use_input_elmo"] = False
    expect_rejection(input_off)

    integrator_off = deepcopy(base_config)
    integrator_off["model"]["use_input_elmo"] = True
    integrator_off["model"]["use_integrator_output_elmo"] = False
    expect_rejection(integrator_off)

    # set num_output_representations to 1, and set both flags to True.
    single_representation = deepcopy(base_config)
    single_representation["model"]["elmo"]["num_output_representations"] = 1
    single_representation["model"]["use_input_elmo"] = True
    single_representation["model"]["use_integrator_output_elmo"] = True
    expect_rejection(single_representation)
def test_no_elmo_but_set_flags_throws_configuration_error(self):
    """Turning on any ELMo flag without an ELMo module configured must be rejected."""
    base_config = Params.from_file(self.param_file)
    # There is no elmo specified in self.param_file, but set
    # use_input_elmo and use_integrator_output_elmo to True.
    flag_settings = [
        # use_input_elmo set to True
        {"use_input_elmo": True},
        # use_integrator_output_elmo set to True
        {"use_input_elmo": False, "use_integrator_output_elmo": True},
        # both use_input_elmo and use_integrator_output_elmo set to True
        {"use_input_elmo": True, "use_integrator_output_elmo": True},
    ]
    for overrides in flag_settings:
        candidate = deepcopy(base_config)
        for flag, value in overrides.items():
            candidate["model"][flag] = value
        with pytest.raises(ConfigurationError):
            Model.from_params(vocab=self.vocab, params=candidate.get("model"))
def optimiser(model: Model) -> torch.optim.Optimizer:
    """Return an ``AdamW`` optimiser over ``model.parameters()`` with lr and weight decay both 1e-3."""
    model_parameters = model.parameters()
    return AdamW(model_parameters, lr=1e-3, weight_decay=1e-3)
def __init__(self, model: Model, dataset_reader: DatasetReader, frozen: bool = True) -> None:
    """
    Store the model and dataset reader and record the device of the model.

    Parameters
    ----------
    model : ``Model``
        The model to wrap; switched to evaluation mode when ``frozen`` is true.
    dataset_reader : ``DatasetReader``
        Stored as ``self._dataset_reader``.
    frozen : ``bool``, optional (default = True)
        If true, ``model.eval()`` is called before the model is stored.
    """
    if frozen:
        model.eval()

    self._model = model
    self._dataset_reader = dataset_reader

    # The device is read from the first named parameter.  A model without any
    # parameters used to raise a bare StopIteration here; it now falls back to
    # -1 (assumed to be this code base's "no CUDA device" id -- TODO confirm).
    first_named_parameter = next(self._model.named_parameters(), None)
    if first_named_parameter is None:
        self.cuda_device = -1
    else:
        self.cuda_device = first_named_parameter[1].get_device()
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Run a learning rate search over ``num_batches`` mini-batches and save a plot of
    the smoothed losses as ``lr-losses.png`` in ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr: ``float``
        Learning rate to start the search.
    end_lr: ``float``
        Learning rate up to which the search is done.
    num_batches: ``int``
        Number of mini-batches to run the learning rate finder on.
    linear_steps: ``bool``
        Increase learning rate linearly; if False, exponentially.
    stopping_factor: ``float``
        Stop the search when the current loss exceeds the best loss recorded by
        a multiple of the stopping factor. If ``None`` the search proceeds until ``end_lr``.
    force: ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is not empty (and ``force`` is
        false), if ``datasets_for_vocab_creation`` names an unknown dataset, or if
        the trainer type is not ``"default"``.
    """
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    trainer_section = params.params.get('trainer')
    cuda_device = trainer_section.get('cuda_device', -1)
    check_for_gpu(cuda_device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    def vocabulary_instances():
        # Walk every dataset and yield only instances of the selected ones.
        for key, dataset in all_datasets.items():
            for instance in dataset:
                if key in datasets_for_vocab_creation:
                    yield instance

    vocab = Vocabulary.from_params(params.pop("vocabulary", {}), vocabulary_instances())

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for parameter_name, parameter in model.named_parameters():
        if any(re.search(pattern, parameter_name) for pattern in no_grad_regexes):
            parameter.requires_grad_(False)

    trainer_choice = trainer_params.pop("type", "default")
    if trainer_choice != "default":
        raise ConfigurationError("currently find-learning-rate only works with the default Trainer")

    trainer = Trainer.from_params(model=model,
                                  serialization_dir=serialization_dir,
                                  iterator=iterator,
                                  train_data=train_data,
                                  validation_data=None,
                                  params=trainer_params,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)
def main(model_dir, model_type, compression_rate, max_sentences, model_index=None):
    """
    Load saved checkpoints of a CMV model and print extractor output on the 'val' data.

    Builds the sub-models from ``<model_dir>/model_params.json``, then loops over
    ``<model_dir>/<model_type>/model_state_epoch_<i>.th`` files, starting at
    ``model_index`` (or 0), until a file is missing.  For every checkpoint it
    prints, per batch element that has at least one ``weakpoints`` entry other
    than -1, the gold indices, the extracted indices and the tokens of each
    sentence, followed by the extractor metrics.

    model_dir : directory holding ``model_params.json``, ``vocabulary`` and the
        per-model-type checkpoint directories.
    model_type : key into ``model_types``; also the checkpoint sub-directory name.
    compression_rate : assigned to the extractor's ``_compression_rate``.
    max_sentences : passed to the extractor as ``n_abs``.
    model_index : if not None, only a single checkpoint is processed.
    """
    # NOTE(review): the nesting of the trailing statements (``print()``, the
    # metrics print, ``i += 1`` and the final ``break``) is assumed -- confirm.
    print(compression_rate, max_sentences)

    i = 0
    if model_index:
        i = model_index

    params = Params.from_file(os.path.join(model_dir, 'model_params.json'))

    ds_params = params.pop('dataset_reader', {})
    data_params = ds_params.pop('data', {})
    dataset_reader = CMVReader.from_params(ds_params)

    # The vocabulary is loaded from the directory saved next to the parameters.
    vocab = Vocabulary.from_params(
        Params({"directory_path": os.path.join(model_dir, 'vocabulary')}))

    val_iterator = DataIterator.from_params(params.pop('generator_iterator'))

    cmv_predictor = Model.from_params(params=params.pop('cmv_predictor'), vocab=vocab)
    document_embedder = Model.from_params(
        params=params.pop('document_embedder'), vocab=vocab)
    cmv_extractor = Model.from_params(params=params.pop('cmv_extractor'))

    # The actor-critic and discriminator sections are optional in the config.
    cmv_actor_critic_params = params.pop('cmv_actor_critic', None)
    cmv_actor_critic = None
    if cmv_actor_critic_params is not None:
        cmv_actor_critic = Model.from_params(params=cmv_actor_critic_params)

    cmv_discriminator_params = params.pop('cmv_discriminator', None)
    cmv_discriminator = None
    if cmv_discriminator_params is not None:
        cmv_discriminator = Model.from_params(params=cmv_discriminator_params)

    # From here on ``params`` holds the constructor keyword arguments of the
    # combined model, no longer the loaded configuration.
    params = dict(document_embedder=document_embedder,
                  cmv_predictor=cmv_predictor,
                  cmv_extractor=cmv_extractor,
                  cmv_actor_critic=cmv_actor_critic)
    if model_type == 'generator':
        params.update(dict(cmv_discriminator=cmv_discriminator))

    model = model_types[model_type](**params)

    data = dataset_reader.read('val', **data_params)
    data.index_instances(vocab)

    while True:
        model_filename = 'model_state_epoch_{}.th'.format(i)
        model_filename = os.path.join(os.path.join(model_dir, model_type), model_filename)
        print(model_filename)
        # Stop at the first checkpoint index that has no file.
        if not os.path.exists(model_filename):
            break

        #load file then do forward_on_instance
        model_state = torch.load(model_filename, map_location=util.device_mapping(-1))
        model.load_state_dict(model_state)
        model.eval()

        val_generator = val_iterator(data, num_epochs=1, shuffle=False)

        model._cmv_extractor._compression_rate = compression_rate

        for batch in val_generator:
            #batch is a tensor dict
            document, mask = model._document_embedder(batch['original_post'])

            idxs, probs, gold_loss = model._cmv_extractor(
                document, mask, batch['label'],
                gold_evidence=batch['weakpoints'],
                n_abs=max_sentences)

            #extracted_sentences = extract(batch['original_post'], idxs)
            #fake_output = model._cmv_predictor(batch['response'], batch['label'], extracted_sentences)

            for bidx, e in enumerate(batch['weakpoints']):
                # Skip batch elements whose weakpoints are all -1.
                if int(e.ne(-1).sum()) == 0:
                    continue

                print(e.numpy().tolist())
                print(idxs[bidx].numpy().tolist())

                for idx, sentence in enumerate(
                        batch['original_post']['tokens'][bidx]):
                    # Map non-zero token indices back to strings, dropping the
                    # end marker and shortening the unknown-token marker.
                    o = [
                        model._cmv_predictor.vocab.get_token_from_index(
                            int(index), 'tokens').replace('@@end@@', '').replace(
                                '@@UNKNOWN@@', 'UNK')
                        for index in sentence if int(index)
                    ]
                    if len(o):
                        print(idx, ' '.join(o))
                print()

        #print(model._cmv_predictor.get_metrics(reset=True))
        print(model._cmv_extractor.get_metrics(reset=True))

        i += 1
        # With an explicit index only that one checkpoint is evaluated.
        if model_index is not None:
            break
def train_model(params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    Train a model from a JSON-style experiment specification and archive the result.

    Note that if you care about reproducibility, you should avoid running code using
    Pytorch or numpy which affect the reproducibility of your experiment before you
    import and use this function; the random seeds are set at the start of it.

    Standard output, standard error and python logging are additionally written to
    files inside ``serialization_dir``, together with a copy of the parameters
    (``model_params.json``) and the vocabulary.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.  Its ``vocabulary``
        section must contain ``directory_path``.
    cuda_device: int
        If not None, overrides ``cuda_device`` in the ``trainer`` section.
    serialization_dir: str, required
        The directory in which to save results and logs.
    filtering: str
        Not used by this function.

    Returns
    -------
    The trained model.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    stdout_log = os.path.join(serialization_dir, "stdout.log")
    stderr_log = os.path.join(serialization_dir, "stderr.log")
    try:
        sys.stdout = TeeLogger(stdout_log, sys.stdout, True)  # type: ignore
        sys.stderr = TeeLogger(stderr_log, sys.stderr, True)  # type: ignore
    except TypeError:
        # Retry with the two-argument form when the three-argument call is rejected.
        sys.stdout = TeeLogger(stdout_log, sys.stdout)  # type: ignore
        sys.stderr = TeeLogger(stderr_log, sys.stderr)  # type: ignore

    file_handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    file_handler.setLevel(logging.INFO)
    log_format = logging.Formatter(
        '%(asctime)s - %(levelname)s - %(name)s - %(message)s')
    file_handler.setFormatter(log_format)
    logging.getLogger().addHandler(file_handler)

    # Save a copy of the parameters before the pops below consume them.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    params_path = os.path.join(serialization_dir, "model_params.json")
    with open(params_path, "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    read_settings = ds_params.pop('read_settings', {})
    dataset_reader = FEVERReader.from_params(ds_params)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(
        train_data_path,
        include_metadata=True,
        replace_with_gold=read_settings.pop('replace_gold', False),
        pad_with_nearest=read_settings.pop('pad_with_nearest', 0))

    validation_data = None
    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path,
                                              include_metadata=True)

    vocab_params = params.pop("vocabulary", {})
    dataset = None
    print(dict(vocab_params), 'directory_path' not in vocab_params)
    # No instances are passed below, so the vocabulary has to come from a directory.
    assert ('directory_path' in vocab_params)
    vocab = Vocabulary.from_params(vocab_params, dataset)
    print(vocab)
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)
    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def test_encoder_feedforward_dim_match(self):
    """Changing the context encoder's ``hidden_size`` to 5 must be rejected."""
    experiment = Params.from_file(self.feedforward_config)
    context_encoder_config = experiment['model']['context_encoder']
    context_encoder_config['hidden_size'] = 5
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=experiment.get('model'))
params.pop(key, None) #Pdb().set_trace() pieces = gan_trainer_hm.TrainerPiecesForSemi.from_params( params, serialization_dir, args.recover, semi_supervision) # pylint: disable=no-member trainer = Trainer.from_params( model=pieces.model, serialization_dir=serialization_dir, iterator=pieces.iterator, train_data=pieces.train_dataset, validation_data=pieces.validation_dataset, params=pieces.params, validation_iterator=pieces.validation_iterator) #pieces for constrained learning" constraints_model = Model.from_params( vocab=pieces.model.vocab, params=params.pop('dd_constraints')) dd_params = [[n, p] for n, p in constraints_model.named_parameters() if p.requires_grad] dd_optimizer = None if len(dd_params) > 0: dd_optimizer = Optimizer.from_params(dd_params, params.pop("dd_optimizer")) else: _ = params.pop('dd_optimizer') params.assert_empty('base train command') try: semi_trainer = gan_trainer_hm.SemiSupervisedTrainer( trainer, constraints_model, dd_optimizer, pieces.validation_iterator, pieces.unlabelled_dataset, semi_supervision, which_mixer, dd_warmup_iters, dd_update_freq,
def run(  # type: ignore
    self,
    model: Model,
    dataset: DatasetDict,
    split: str = "validation",
    data_loader: Optional[Lazy[TangoDataLoader]] = None,
) -> EvaluationResult:
    """
    Runs an evaluation on a dataset.

    * `model` is the model we want to evaluate.
    * `dataset` is the dataset we want to evaluate on.
    * `split` is the name of the split we want to evaluate on.
    * `data_loader` gives you the chance to choose a custom dataloader for the evaluation.
      By default this step evaluates on batches of 32 instances each.

    Returns an `EvaluationResult` built from the final metrics (including the
    average `"loss"` when the model produced one) and the per-instance predictions.

    Raises `RuntimeError` if the model produced a loss for some batches but not all.
    """
    concrete_data_loader: TangoDataLoader
    if data_loader is None:
        concrete_data_loader = BatchSizeDataLoader(dataset.splits[split],
                                                   batch_size=32,
                                                   shuffle=False)
    else:
        concrete_data_loader = data_loader.construct(
            instances=dataset.splits[split])

    if torch.cuda.device_count() > 0:
        cuda_device = torch.device(0)
    else:
        cuda_device = torch.device("cpu")
    check_for_gpu(cuda_device)
    # NOTE(review): batches are moved to `cuda_device` below but the model is not;
    # presumably the caller has already placed it on that device -- confirm.

    generator_tqdm = Tqdm.tqdm(iter(concrete_data_loader))

    predictions: List[Dict[str, Any]] = []

    # Number of batches where the model produces a loss.
    loss_count = 0
    # Number of batches in instances.
    batch_count = 0
    # Cumulative loss
    total_loss = 0.0

    with torch.inference_mode():
        model.eval()

        # Iterate through the tqdm wrapper: looping over the data loader itself
        # left the progress bar at zero and created a second, unused iterator.
        for batch in generator_tqdm:
            batch_count += 1
            batch = move_to_device(batch, cuda_device)
            output_dict = model(**batch)

            metrics = model.get_metrics()

            loss = output_dict.pop("loss", None)
            if loss is not None:
                loss_count += 1
                total_loss += loss.item()
                metrics["loss"] = total_loss / loss_count

            if any(
                    metric_name.startswith("_")
                    for metric_name in metrics):
                self.logger.warning_once(
                    'Metrics with names beginning with "_" will '
                    "not be logged to the tqdm progress bar.")

            description = (", ".join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items()
                if not name.startswith("_")
            ]) + " ||")
            generator_tqdm.set_description(description, refresh=False)

            output_dict = sanitize(output_dict)

            # This is write-only code, but it's quite fast.
            # It turns a dict of per-key batch sequences into one dict per instance.
            predictions.extend(
                dict(zip(output_dict.keys(), x))
                for x in zip(*output_dict.values()))

        final_metrics = model.get_metrics(reset=True)

    if loss_count > 0:
        # Sanity check
        if loss_count != batch_count:
            raise RuntimeError(
                "The model you are trying to evaluate only sometimes produced a loss!"
            )
        final_metrics["loss"] = total_loss / loss_count

    return self.EvaluationResult(final_metrics, predictions)
def test_model_load(self):
    """``Model.load`` on the SRL fixture config returns a ``SemanticRoleLabeler``."""
    experiment = Params.from_file('tests/fixtures/srl/experiment.json')
    loaded = Model.load(experiment)
    assert isinstance(loaded, SemanticRoleLabeler)
def test_embedding_encoder_dim_match(self):
    """Changing the context token embedding dimension to 5 must be rejected."""
    experiment = Params.from_file(self.in_context_config)
    token_embedders = experiment['model']['context_field_embedder']["token_embedders"]
    token_embedders['tokens']["embedding_dim"] = 5
    with pytest.raises(ConfigurationError):
        Model.from_params(vocab=self.vocab, params=experiment.get('model'))
def test_model_load(self):
    """``Model.load`` on the decomposable attention fixture returns a ``DecomposableAttention``."""
    fixture_dir = self.FIXTURES_ROOT / 'decomposable_attention'
    experiment = Params.from_file(fixture_dir / 'experiment.json')
    loaded = Model.load(experiment, serialization_dir=fixture_dir / 'serialization')
    assert isinstance(loaded, DecomposableAttention)
def from_params(params: Params, serialization_dir: str, recover: bool = False) -> 'TrainerPieces':
    """
    Assemble the datasets, vocabulary, model and iterators for multi-task training.

    Parameters
    ----------
    params : ``Params``
        Experiment configuration; the ``datasets_for_vocab_creation``, ``vocabulary``,
        ``model``, ``iterator``, ``validation_iterator`` and ``trainer`` entries are
        popped here.
    serialization_dir : ``str``
        Directory the vocabulary is saved to (and, when recovering, loaded from).
    recover : ``bool``, optional (default = False)
        If true and ``<serialization_dir>/vocabulary`` exists, that vocabulary is
        loaded instead of building one from the datasets.

    Returns
    -------
    A ``MultiTaskTrainerPieces`` holding the model, iterator, train / validation /
    test data, validation iterator and the remaining trainer parameters.

    Raises
    ------
    ConfigurationError
        If ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    all_datasets = multitask_datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    def vocabulary_instances():
        # Walk every dataset and yield only instances of the selected ones.
        for key, dataset in all_datasets.items():
            for instance in dataset:
                if key in datasets_for_vocab_creation:
                    yield instance

    saved_vocab_dir = os.path.join(serialization_dir, "vocabulary")
    if recover and os.path.exists(saved_vocab_dir):
        vocab = Vocabulary.from_files(os.path.join(serialization_dir, "vocabulary"))
        # The vocabulary section is still popped so it does not linger in ``params``.
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(params.pop("vocabulary", {}), vocabulary_instances())

    model = Model.from_params(vocab=vocab, params=params.pop('model'))

    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)

    validation_iterator = None
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(model.vocab)

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    test_data = all_datasets.get('test')

    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for parameter_name, parameter in model.named_parameters():
        if any(re.search(pattern, parameter_name) for pattern in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for frozen_name in frozen_parameter_names:
        logger.info(frozen_name)
    logger.info("Following parameters are Tunable (with gradient):")
    for tunable_name in tunable_parameter_names:
        logger.info(tunable_name)

    return MultiTaskTrainerPieces(model, iterator,
                                  train_data, validation_data, test_data,
                                  validation_iterator, trainer_params)
def __init__(self,
             vocab: Vocabulary,
             kg_model: Model = None,
             entity_embedding: Embedding = None,
             concat_entity_embedder: EntityEmbedder = None,
             contextual_embedding_dim: int = None,
             span_encoder_config: Dict[str, int] = None,
             margin: float = 0.2,
             decode_threshold: float = 0.0,
             loss_type: str = 'margin',
             max_sequence_length: int = 512,
             dropout: float = 0.1,
             output_feed_forward_hidden_dim: int = 100,
             initializer_range: float = 0.02,
             include_null_embedding_in_dot_attention: bool = False,
             namespace: str = 'entity',
             regularizer: RegularizerApplicator = None):
    """
    Set up the entity linker around an ``EntityDisambiguator``.

    Exactly one of ``kg_model``, ``entity_embedding`` and ``concat_entity_embedder``
    must be given; it determines the entity embedding and its dimension:

    * ``kg_model`` - the embedding comes from ``kg_model.get_entity_embedding()``.
    * ``entity_embedding`` - used as is.
    * ``concat_entity_embedder`` - used as the embedding after its parameters are
      frozen with ``set_requires_grad(..., False)``.

    ``decode_threshold`` is forwarded to the disambiguator as
    ``weighted_entity_threshold`` only when ``loss_type`` is ``'margin'``.

    Raises
    ------
    ValueError
        If zero or more than one embedding source is supplied.
    """
    super().__init__(vocab,
                     margin=margin,
                     decode_threshold=decode_threshold,
                     loss_type=loss_type,
                     namespace=namespace,
                     regularizer=regularizer)

    embedding_sources = (kg_model, entity_embedding, concat_entity_embedder)
    num_embeddings_passed = sum(source is not None for source in embedding_sources)
    if num_embeddings_passed != 1:
        raise ValueError("Linking model needs either a kg factorisation model or an entity embedding.")

    # Exactly one source is set from here on.
    if kg_model is not None:
        entity_embedding = kg_model.get_entity_embedding()
        entity_embedding_dim = entity_embedding.embedding_dim
    elif entity_embedding is not None:
        entity_embedding_dim = entity_embedding.get_output_dim()
    else:
        entity_embedding_dim = concat_entity_embedder.get_output_dim()
        set_requires_grad(concat_entity_embedder, False)
        entity_embedding = concat_entity_embedder

    weighted_entity_threshold = decode_threshold if loss_type == 'margin' else None

    null_entity_id = self.vocab.get_token_index('@@NULL@@', namespace)
    unknown_entity_id = self.vocab.get_token_index('@@UNKNOWN@@', namespace)
    assert null_entity_id != unknown_entity_id

    self.disambiguator = EntityDisambiguator(
        contextual_embedding_dim,
        entity_embedding_dim=entity_embedding_dim,
        entity_embeddings=entity_embedding,
        max_sequence_length=max_sequence_length,
        span_encoder_config=span_encoder_config,
        dropout=dropout,
        output_feed_forward_hidden_dim=output_feed_forward_hidden_dim,
        initializer_range=initializer_range,
        weighted_entity_threshold=weighted_entity_threshold,
        include_null_embedding_in_dot_attention=include_null_embedding_in_dot_attention,
        null_entity_id=null_entity_id)
def dry_run_from_params(params: Params,
                        serialization_dir: str,
                        force: bool = False,
                        cache_directory: str = None,
                        cache_prefix: str = None) -> None:
    """
    Build the vocabulary and the model for an experiment without training it.

    The datasets are loaded, a vocabulary is created from the selected datasets and
    written to ``<serialization_dir>/vocabulary``, dataset and vocabulary statistics
    are printed, the model is instantiated, and the names of its frozen and tunable
    parameters are logged.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying the experiment; the keys used here are popped.
    serialization_dir : ``str``
        Directory in which the vocabulary is saved.  It is created if missing.
    force : ``bool``, optional (default=False)
        If ``True``, we will overwrite the serialization directory if it already exists.
    cache_directory : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.
    cache_prefix : ``str``, optional
        For caching data pre-processing.  See :func:`allennlp.training.util.datasets_from_params`.

    Raises
    ------
    ConfigurationError
        If the ``vocabulary`` directory already exists and is non-empty, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    if os.path.exists(serialization_dir) and force:
        shutil.rmtree(serialization_dir)

    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # ``os.listdir`` always returns a list, so test its truthiness: only a
    # directory that actually contains entries counts as non-empty.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params, cache_directory, cache_prefix)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    # Materialised as a list because the instances are used twice below: once to
    # build the vocabulary and once to build the batch used for statistics.
    instances = [instance
                 for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")

    # Freeze every parameter whose name matches one of the ``no_grad`` regexes.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)

    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)

    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
# Keyword arguments passed to ``VCRLoader.from_dataset`` for every evaluation split.
loader_params = {'batch_size': 96 // NUM_GPUS, 'num_gpus': NUM_GPUS, 'num_workers': num_workers}

# Fallback used when the dataset reader config has no 'only_use_relevant_dets' key.
only_use_relevant_dets = False
vcr_modes = VCR.eval_splits(embs_to_load=params['dataset_reader'].get('embs', 'bert_da'),
                            only_use_relevant_dets=params['dataset_reader'].get('only_use_relevant_dets',
                                                                                only_use_relevant_dets))

# One entry is appended per evaluated split.
probs_grp = []
ids_grp = []

# The splits are paired with the names 'answer', 'rationale_0' ... 'rationale_3';
# only the part before the underscore is used to pick the checkpoint argument.
for (vcr_dataset, mode_long) in zip(vcr_modes, ['answer'] + [f'rationale_{i}' for i in range(4)]):
    mode = mode_long.split('_')[0]

    test_loader = VCRLoader.from_dataset(vcr_dataset, **loader_params)

    # Load the params again because allennlp will delete them... ugh.
    params = Params.from_file(args.params)
    print("Loading {} for {}".format(params['model'].get('type', 'WTF?'), mode), flush=True)
    model = Model.from_params(vocab=vcr_dataset.vocab, params=params['model'])

    # Turn off running-statistics tracking on every BatchNorm2d in the detector backbone.
    for submodule in model.detector.backbone.modules():
        if isinstance(submodule, BatchNorm2d):
            submodule.track_running_stats = False

    # The checkpoint path comes from ``args.<mode>_ckpt``.
    model_state = torch.load(getattr(args, f'{mode}_ckpt'), map_location=device_mapping(-1))
    model.load_state_dict(model_state)

    model = DataParallel(model).cuda() if NUM_GPUS > 1 else model.cuda()
    model.eval()

    test_probs = []
    test_ids = []
    # NOTE(review): the rest of this loop body is not visible in this fragment.
    for b, (time_per_batch, batch) in enumerate(time_batch(test_loader)):
        with torch.no_grad():
            batch = _to_gpu(batch)
# NOTE(review): the two appends below look like the tail of a per-split loop whose
# header is not visible in this fragment - restore their indentation when merging.
probs_grp.append(probs)
ids_grp.append(ids)

# Double check the IDs are in the same order for everything.
# The comparison must be reduced with ``all``: asserting the bare list of
# comparison results is always true for a non-empty list, so it could never fail.
# Each id collection is turned into a list first so the equality yields one bool.
assert all(list(x) == list(ids_grp[0]) for x in ids_grp)

probs_grp = np.stack(probs_grp, 1).reshape((-1, 20))
to_leaderboard_csv(probs_grp, ids_grp[0], args.outfile)


if __name__ == '__main__':
    args = parse_args()
    params = Params.from_file(args.params)
    # Multi-task models are recognised by 'MultiTask' appearing in the model type.
    multitask = 'MultiTask' in params['model']['type']
    model = Model.from_params(params=params['model'])
    LOG.info('Loaded model {} from {}'.format(params['model'].get('type', ''), args.params))

    num_gpus = torch.cuda.device_count()
    assert num_gpus >= 1, "No CUDA devices found"
    LOG.info('Found {} GPUs'.format(num_gpus))
    model = DataParallel(model).cuda() if num_gpus > 1 else model.cuda()

    if args.answer_model or args.rationale_model:
        assert args.split == 'val', "Not yet supported"
        compute_baseline(model, params, args)

    if args.ar_model and not multitask:
        if args.split == 'val':
            joint_eval(model, params, args)
mask = util.get_text_field_mask(text) # Shape: (batch_size, encoding_dim) encoded_text = self.encoder(embedded_text, mask) # Shape: (batch_size, num_labels) logits = self.classifier(encoded_text) # Shape: (batch_size, num_labels) probs = torch.nn.functional.softmax(logits) # Shape: (1,) loss = torch.nn.functional.cross_entropy(logits, label) return {'loss': loss, 'probs': probs} iterator = BasicIterator(batch_size=2) iterator.index_with(vocab) model_params = """ { "type": "simple_classifier", "embedder": {"token_embedders": { "tokens": {"type": "embedding", "embedding_dim": 10} }}, "encoder": {"type": "bag_of_embeddings"} } """ model = Model.from_params(vocab, Params(json.loads(model_params))) for batch in iterator(instances, num_epochs=1): outputs = model(batch) print(f"Model outputs: {outputs}")
def test_model_load(self):
    """``Model.load`` on the decomposable-attention fixture returns a ``DecomposableAttention``."""
    experiment_file = 'tests/fixtures/decomposable_attention/experiment.json'
    fixture_serialization_dir = 'tests/fixtures/decomposable_attention/serialization'

    params = Params.from_file(experiment_file)
    loaded_model = Model.load(params, serialization_dir=fixture_serialization_dir)

    assert isinstance(loaded_model, DecomposableAttention)
def _from_params(
        cls,  # type: ignore
        model: Model,
        serialization_dir: str,
        iterator: DataIterator,
        train_data: Iterable[Instance],
        validation_data: Optional[Iterable[Instance]],
        params: Params,
        validation_iterator: DataIterator = None) -> DecompTrainer:
    # pylint: disable=arguments-differ
    """
    Construct the trainer from a trainer ``params`` block.

    Besides popping the usual trainer options, this decides which parameters are
    trainable and by which optimizer:

    * With a ``bert_optimizer`` block, the BERT embeddings, the pooler and every
      encoder layer below ``bert_tune_layer`` are frozen, all other parameters are
      marked trainable, and the encoder layers at or above ``bert_tune_layer`` get
      their own optimizer.
    * Without it, every parameter whose name contains ``_bert_encoder`` is frozen
      and no BERT optimizer is built.

    The main optimizer receives all remaining parameters that require gradients.

    Parameters
    ----------
    model : ``Model``
        The model to train; it is moved to the first CUDA device if one is configured.
    serialization_dir : ``str``
        Passed to the trainer and to the default ``Checkpointer``.
    iterator : ``DataIterator``
    train_data : ``Iterable[Instance]``
    validation_data : ``Optional[Iterable[Instance]]``
    params : ``Params``
        Trainer configuration; it must be fully consumed by the end of this method.
    validation_iterator : ``DataIterator``, optional

    Returns
    -------
    An instance of ``cls`` built from the options above.

    Raises
    ------
    ConfigurationError
        If both a ``checkpointer`` block and the legacy checkpointing keys are given,
        or (through ``params.assert_empty``) if unknown keys remain in ``params``.
    """
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)

    validation_data_path = params.pop("validation_data_path", None)
    validation_prediction_path = params.pop("validation_prediction_path", None)

    semantics_only = params.pop("semantics_only", False)
    drop_syntax = params.pop("drop_syntax", True)
    include_attribute_scores = params.pop("include_attribute_scores", False)

    warmup_epochs = params.pop("warmup_epochs", 0)

    model_device = cuda_device[0] if isinstance(cuda_device, list) else cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)

    bert_optim_params = params.pop("bert_optimizer", None)
    bert_name = "_bert_encoder"

    if bert_optim_params is not None:
        tune_after_layer_num = params.pop("bert_tune_layer", 12)

        # Raw strings keep ``\.`` as a regex escape without relying on Python
        # passing unknown string escapes through unchanged.
        frozen_regex_str = [
            rf"({bert_name}\.bert_model\.embeddings.*)",
            rf"({bert_name}\.bert_model\.pooler.*)",
        ]
        tune_regex_str = []
        for i in range(0, 12):
            # match all numbers greater than layer num via disjunction
            tune_regex_one = rf"({bert_name}\.bert_model\.encoder\.layer\.{i}\..*)"
            if i >= tune_after_layer_num:
                tune_regex_str.append(tune_regex_one)
            else:
                frozen_regex_str.append(tune_regex_one)

        # NOTE(review): with the default ``bert_tune_layer`` of 12 no layer is
        # selected, the joined pattern is empty and therefore matches every
        # parameter name - confirm whether that configuration is meant to work.
        tune_regex = re.compile("|".join(tune_regex_str))
        frozen_regex = re.compile("|".join(frozen_regex_str))

        # decide which params require grad for which optimizer
        all_names = [n for n, _ in model.named_parameters()]
        tune_bert_names = [n for n in all_names if tune_regex.match(n) is not None]
        frozen_names = {n for n in all_names if frozen_regex.match(n) is not None}
        tune_bert_name_set = set(tune_bert_names)

        # assert that they're disjoint
        assert not (frozen_names & tune_bert_name_set)

        # set tunable params to require gradient, frozen ones to not require
        for n, p in model.named_parameters():
            p.requires_grad = n not in frozen_names

        # extract BERT
        bert_params = [[n, p] for n, p in model.named_parameters()
                       if p.requires_grad and n in tune_bert_name_set]
        # make sure this matches the tuneable bert params
        assert [x[0] for x in bert_params] == tune_bert_names
        bert_optimizer = Optimizer.from_params(bert_params, bert_optim_params)
    else:
        # freeze all BERT params
        tune_bert_names = []
        tune_bert_name_set = set()
        bert_optimizer = None
        for n, p in model.named_parameters():
            if bert_name in n:
                p.requires_grad = False

    # model params
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad and n not in tune_bert_name_set]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(
            optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None

    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(
            optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool(
        "should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    syntactic_method = params.pop("syntactic_method", None)
    accumulate_batches = params.pop("accumulate_batches", 1)

    # Every recognised key has been popped by now; anything left is a config error.
    params.assert_empty(cls.__name__)

    return cls(model=model,
               optimizer=optimizer,
               bert_optimizer=bert_optimizer,
               iterator=iterator,
               train_dataset=train_data,
               validation_dataset=validation_data,
               validation_data_path=validation_data_path,
               validation_prediction_path=validation_prediction_path,
               semantics_only=semantics_only,
               warmup_epochs=warmup_epochs,
               syntactic_method=syntactic_method,
               drop_syntax=drop_syntax,
               include_attribute_scores=include_attribute_scores,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average,
               accumulate_batches=accumulate_batches)
def train_model(db: FeverDocDB,
                params: Union[Params, Dict[str, Any]],
                cuda_device: int,
                serialization_dir: str,
                filtering: str) -> Model:
    """
    Train a model from a JSON-style experiment specification and archive the result.

    Note that if you care about reproducibility, you should avoid running code using
    Pytorch or numpy which affect the reproducibility of your experiment before you
    call this function; the random seeds are set here via ``SimpleRandom.set_seeds()``.

    The function redirects stdout / stderr and Python logging into
    ``serialization_dir``, writes the configuration to ``config.json``, reads the
    training (and optional validation) data with a ``FEVERReader``, builds and saves
    the vocabulary, trains the model and finally archives the serialization directory.

    Parameters
    ----------
    db : ``FeverDocDB``, required.
        Document database handed to the ``FEVERReader``.
    params : ``Params``, required.
        A parameter object specifying an AllenNLP Experiment.
    cuda_device : ``int``
        If not ``None``, overrides the ``cuda_device`` entry of the trainer parameters.
    serialization_dir : ``str``, required
        The directory in which to save results and logs.
    filtering : ``str``
        Passed to the ``FEVERReader`` as its ``filtering`` argument.

    Returns
    -------
    The trained ``Model``.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout, True)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr, True)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)

    # Serialise a copy: the original ``params`` is consumed by the pops below.
    serialization_params = deepcopy(params).as_dict(quiet=True)
    with open(os.path.join(serialization_dir, "config.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(
        db,
        sentence_level=ds_params.pop("sentence_level", False),
        wiki_tokenizer=Tokenizer.from_params(
            ds_params.pop('wiki_tokenizer', {})),
        claim_tokenizer=Tokenizer.from_params(
            ds_params.pop('claim_tokenizer', {})),
        token_indexers=FEVERReader.custom_dict_from_params(
            ds_params.pop('token_indexers', {})),
        filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        [instance for dataset in all_datasets for instance in dataset])
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    try:
        model = Model.from_params(params.pop('model'), vocab=vocab)
        iterator = DataIterator.from_params(params.pop("iterator"))
    except Exception as e:
        # Log with the traceback and re-raise: continuing would only fail a few
        # lines later with an unrelated "referenced before assignment" error.
        logger.exception("Crashed with error: %s", e)
        raise

    iterator.index_with(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data,
                                  trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
def find_learning_rate_model(params: Params,
                             serialization_dir: str,
                             start_lr: float = 1e-5,
                             end_lr: float = 10,
                             num_batches: int = 100,
                             linear_steps: bool = False,
                             stopping_factor: float = None,
                             force: bool = False) -> None:
    """
    Run a learning rate search for ``num_batches`` mini-batches and save a plot of
    the smoothed losses to ``lr-losses.png`` inside ``serialization_dir``.

    Parameters
    ----------
    params : ``Params``
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : ``str``
        The directory in which to save results.
    start_lr : ``float``
        Learning rate to start the search.
    end_lr : ``float``
        Learning rate up to which the search is done.
    num_batches : ``int``
        Number of mini-batches to run the learning rate finder on.
    linear_steps : ``bool``
        Increase the learning rate linearly if ``True``, exponentially otherwise.
    stopping_factor : ``float``
        Stop the search when the current loss exceeds the best loss recorded by a
        multiple of the stopping factor.  If ``None`` the search proceeds till ``end_lr``.
    force : ``bool``
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    Raises
    ------
    ConfigurationError
        If the serialization directory exists and is not empty, or if
        ``datasets_for_vocab_creation`` names a dataset that was not loaded.
    """
    if force and os.path.exists(serialization_dir):
        shutil.rmtree(serialization_dir)

    if os.path.exists(serialization_dir) and os.listdir(serialization_dir):
        raise ConfigurationError(f'Serialization directory {serialization_dir} already exists and is '
                                 f'not empty.')
    os.makedirs(serialization_dir, exist_ok=True)

    prepare_environment(params)

    # ``cuda_device`` may be a single device or a list of devices; check each one.
    cuda_device = params.params.get('trainer').get('cuda_device', -1)
    devices_to_check = cuda_device if isinstance(cuda_device, list) else [cuda_device]
    for device in devices_to_check:
        check_for_gpu(device)

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    vocab_params = params.pop("vocabulary", {})
    vocab_instances = (instance
                       for key, dataset in all_datasets.items()
                       for instance in dataset
                       if key in datasets_for_vocab_creation)
    vocab = Vocabulary.from_params(vocab_params, vocab_instances)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)

    train_data = all_datasets['train']

    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for parameter_name, parameter in model.named_parameters():
        matches_no_grad = any(re.search(regex, parameter_name) for regex in no_grad_regexes)
        if matches_no_grad:
            parameter.requires_grad_(False)

    trainer = Trainer.from_params(model,
                                  serialization_dir,
                                  iterator,
                                  train_data,
                                  params=trainer_params,
                                  validation_data=None,
                                  validation_iterator=None)

    logger.info(f'Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations.')
    learning_rates, raw_losses = search_learning_rate(trainer,
                                                      start_lr=start_lr,
                                                      end_lr=end_lr,
                                                      num_batches=num_batches,
                                                      linear_steps=linear_steps,
                                                      stopping_factor=stopping_factor)
    logger.info(f'Finished learning rate search.')

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, 'lr-losses.png')
    _save_plot(learning_rates, smoothed_losses, plot_path)
def find_learning_rate_model(
    params: Params,
    serialization_dir: str,
    start_lr: float = 1e-5,
    end_lr: float = 10,
    num_batches: int = 100,
    linear_steps: bool = False,
    stopping_factor: float = None,
    force: bool = False,
) -> None:
    """
    Run a learning rate search for `num_batches` mini-batches and save a plot of the
    smoothed losses to `lr-losses.png` inside `serialization_dir`.

    # Parameters

    params : `Params`
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir : `str`
        The directory in which to save results.
    start_lr : `float`
        Learning rate to start the search.
    end_lr : `float`
        Learning rate up to which the search is done.
    num_batches : `int`
        Number of mini-batches to run the learning rate finder on.
    linear_steps : `bool`
        Increase the learning rate linearly if `True`, exponentially otherwise.
    stopping_factor : `float`
        Stop the search when the current loss exceeds the best loss recorded by a
        multiple of the stopping factor. If `None` the search proceeds till `end_lr`.
    force : `bool`
        If True and the serialization directory already exists, everything in it will
        be removed prior to finding the learning rate.

    # Raises

    ConfigurationError
        If `datasets_for_vocab_creation` names a dataset that was not loaded, or if
        the configured trainer type is not `gradient_descent`.
    """
    create_serialization_dir(params, serialization_dir, recover=False, force=force)

    prepare_environment(params)

    trainer_config = params.params.get("trainer")
    cuda_device = trainer_config.get("cuda_device", -1)
    check_for_gpu(cuda_device)

    distributed_params = params.params.get("distributed")
    # See https://github.com/allenai/allennlp/issues/3658
    assert not distributed_params, "find-lr is not compatible with DistributedDataParallel."

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset_name in datasets_for_vocab_creation:
        if dataset_name not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset_name}")

    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation),
    )

    vocab_params = params.pop("vocabulary", {})
    vocab_instances = (
        instance
        for key, dataset in all_datasets.items()
        for instance in dataset
        if key in datasets_for_vocab_creation
    )
    vocab = Vocabulary.from_params(vocab_params, instances=vocab_instances)

    train_data = all_datasets["train"]
    train_data.index_with(vocab)

    model = Model.from_params(vocab=vocab, params=params.pop("model"))
    data_loader = DataLoader.from_params(dataset=train_data,
                                         params=params.pop("data_loader"))

    trainer_params = params.pop("trainer")

    no_grad_regexes = trainer_params.pop("no_grad", ())
    for parameter_name, parameter in model.named_parameters():
        matches_no_grad = any(re.search(regex, parameter_name) for regex in no_grad_regexes)
        if matches_no_grad:
            parameter.requires_grad_(False)

    # Only the gradient descent trainer is supported by the search below.
    trainer_choice = trainer_params.pop("type", "gradient_descent")
    if trainer_choice != "gradient_descent":
        raise ConfigurationError(
            "currently find-learning-rate only works with the GradientDescentTrainer"
        )

    trainer: GradientDescentTrainer = Trainer.from_params(  # type: ignore
        model=model,
        serialization_dir=serialization_dir,
        data_loader=data_loader,
        params=trainer_params,
    )

    logger.info(
        f"Starting learning rate search from {start_lr} to {end_lr} in {num_batches} iterations."
    )
    learning_rates, raw_losses = search_learning_rate(
        trainer,
        start_lr=start_lr,
        end_lr=end_lr,
        num_batches=num_batches,
        linear_steps=linear_steps,
        stopping_factor=stopping_factor,
    )
    logger.info("Finished learning rate search.")

    smoothed_losses = _smooth(raw_losses, 0.98)
    plot_path = os.path.join(serialization_dir, "lr-losses.png")
    _save_plot(learning_rates, smoothed_losses, plot_path)
def get_knowbert(vocab, mode, include_wiki=False):
    """
    Build a small ``knowbert`` model for tests.

    The model always has a WordNet knowledge base soldered in at layer 1, with an
    entity linker whose KG model is loaded from ``ARCHIVE_FILE``.  With
    ``include_wiki`` a second knowledge base using a 14 x 24 entity embedding is
    soldered in at layer 0, and the two linkers are given the namespaces
    ``entity_wordnet`` and ``entity_wiki``.
    """
    def span_encoder_config():
        # Returns a new dict on every call so the two linkers never share one.
        return {
            "hidden_size": 24,
            "num_hidden_layers": 1,
            "num_attention_heads": 3,
            "intermediate_size": 37
        }

    wordnet_linker = {
        "type": "entity_linking_with_candidate_mentions",
        "kg_model": {
            "type": "from_archive",
            "archive_file": ARCHIVE_FILE,
        },
        "contextual_embedding_dim": 12,
        "max_sequence_length": 64,
        "span_encoder_config": span_encoder_config(),
    }
    wordnet_kg = {
        "type": "soldered_kg",
        "entity_linker": wordnet_linker,
        "span_attention_config": {
            "hidden_size": 24,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
            "intermediate_size": 55
        }
    }

    soldered_kgs = {"wordnet": wordnet_kg}
    soldered_layers = {"wordnet": 1}

    # NOTE(review): the namespace override for the WordNet linker is applied only
    # together with the wiki knowledge base - confirm against the original layout.
    if include_wiki:
        wiki_linker = {
            "type": "entity_linking_with_candidate_mentions",
            "namespace": "entity_wiki",
            "entity_embedding": {
                "num_embeddings": 14,
                "embedding_dim": 24,
            },
            "contextual_embedding_dim": 12,
            "max_sequence_length": 64,
            "span_encoder_config": span_encoder_config(),
        }
        soldered_kgs["wiki"] = {
            "type": "soldered_kg",
            "entity_linker": wiki_linker,
            "span_attention_config": {
                "hidden_size": 24,
                "num_hidden_layers": 1,
                "num_attention_heads": 4,
                "intermediate_size": 55
            }
        }
        soldered_layers["wiki"] = 0
        wordnet_linker["namespace"] = "entity_wordnet"

    params = {
        "type": "knowbert",
        "mode": mode,
        "soldered_kgs": soldered_kgs,
        "soldered_layers": soldered_layers,
        "bert_model_name": "tests/fixtures/bert/bert_test_fixture.tar.gz",
    }

    return Model.from_params(Params(params), vocab=vocab)
def evaluate_perplexity(model: Model,
                        sampler: Model,
                        num_samples: int,
                        instances: Iterator[Instance],
                        data_iterator: DataIterator,
                        cuda_device: int) -> Dict[str, Any]:
    """
    Estimate perplexity by scoring annotations drawn from ``sampler`` with ``model``.

    For each of ``num_samples`` passes over ``instances``, every batch is sampled
    with ``sampler.sample`` and scored by ``model``; the differences between the
    model's ``logp`` / ``penalized_logp`` and the sampler's ``logp`` are summed over
    the pass.  The per-pass sums are combined with ``logsumexp`` minus the log of
    the number of passes so far, and normalised by the number of unmasked source
    tokens in the pass.  The running ``PPL`` and ``UPP`` values are printed.

    Returns
    -------
    A dict with the keys ``ppl`` and ``upp``.
    """
    check_for_gpu(cuda_device)

    logger.info('Iterating over dataset')

    # NOTE(review): the running estimate is assumed to be recomputed (and printed)
    # after every sampling pass, as the use of the pass index in the normaliser suggests.
    with torch.no_grad():

        logp_differences = []
        penalized_logp_differences = []

        for sample_index in range(num_samples):
            batches = data_iterator(instances, num_epochs=1, shuffle=False)
            generator_tqdm = Tqdm.tqdm(batches, total=0)

            model.eval()
            sampler.eval()

            pass_difference = 0.0
            pass_penalized_difference = 0.0
            denom = 0
            for batch, _ in generator_tqdm:

                batch = util.move_to_device(batch, cuda_device)

                # We need sequence length to help compute perplexity
                source_mask = util.get_text_field_mask(batch['source'])
                denom += source_mask.float().sum().item()

                # Draw a sample
                sampler_output = sampler.sample(**batch)
                sample_logp = sampler_output['logp']
                sample = sampler_output['sample']

                # Evaluate on sample
                model_output = model(**sample)
                model_logp = model_output['logp']
                model_penalized_logp = model_output['penalized_logp']

                pass_difference += (model_logp - sample_logp).item()
                pass_penalized_difference += (model_penalized_logp - sample_logp).item()

            logp_differences.append(pass_difference)
            penalized_logp_differences.append(pass_penalized_difference)

            log_num_passes = math.log(sample_index + 1)
            combined = torch.logsumexp(torch.tensor(logp_differences), dim=0)
            combined_penalized = torch.logsumexp(torch.tensor(penalized_logp_differences), dim=0)
            sum_logp = (combined - log_num_passes).item()
            sum_logp_penalized = (combined_penalized - log_num_passes).item()
            ppl = math.exp(-sum_logp / denom)
            upp = math.exp(-sum_logp_penalized / denom)

            print('PPL: %f' % ppl)
            print('UPP: %f' % upp)

    return {'ppl': ppl, 'upp': upp}
def _load_model(self, basedir):
    """
    Load the model serialized in ``basedir`` together with its sentiment map.

    The configuration is read from ``<basedir>/config.json``; the sentiment map is
    derived from the loaded model's vocabulary via ``_vocab_to_sentiment_map``.

    Returns a ``(model, sentiment_map)`` tuple.
    """
    config = Params.from_file(os.path.join(basedir, 'config.json'))
    loaded_model = Model.load(config=config, serialization_dir=basedir)
    return loaded_model, self._vocab_to_sentiment_map(loaded_model.vocab)
def load_weights(model: Model, path: str, location: str = 'cpu') -> None:
    """
    Load a state dict from the file at ``path`` into ``model`` in place.

    ``location`` is handed to ``torch.load`` as ``map_location``.
    """
    with open(path, 'rb') as weights_file:
        state_dict = torch.load(weights_file, map_location=location)
    model.load_state_dict(state_dict)
def _load_model(basedir):
    """Load the model serialized in ``basedir`` using its ``config.json``."""
    config = Params.from_file(os.path.join(basedir, 'config.json'))
    return Model.load(config=config, serialization_dir=basedir)
def _load(config: Params,
          adapters_dir: str,
          serialization_dir: str,
          weights_file: str = None,
          cuda_device: int = -1) -> 'Model':
    """
    Instantiates a model from the experiment configuration and loads adapter
    weights from ``adapters_dir`` into its BERT encoder layers.

    Parameters
    ----------
    config : ``Params``
        Experiment configuration; its ``vocabulary`` and ``model`` sections are read.
    adapters_dir : ``str``
        Directory whose files are ``torch.load``-ed as adapter state.  Each file is
        expected to hold ``attention_adapter_<i>`` and ``output_adapter_<i>`` entries
        for every encoder layer index ``i``.
    serialization_dir : ``str``
        Directory containing the saved ``vocabulary``.
    weights_file : ``str``, optional
        Defaults to ``<serialization_dir>/best.th``.
    cuda_device : ``int``, optional (default=-1)
        The model is moved to this CUDA device, or to the CPU when negative.

    Returns
    -------
    The model, or ``None`` if an ``AttributeError`` is raised while loading the
    adapters (a warning and the traceback are emitted in that case).
    """
    # NOTE(review): the full-weights load further down is commented out, so
    # ``weights_file`` is currently not read.
    weights_file = weights_file or os.path.join(serialization_dir, "best.th")

    # Load vocabulary from file
    vocab_dir = os.path.join(serialization_dir, 'vocabulary')
    # If the config specifies a vocabulary subclass, we need to use it.
    vocab_params = config.get("vocabulary", Params({}))
    vocab_choice = vocab_params.pop_choice("type", Vocabulary.list_available(), True)
    vocab = Vocabulary.by_name(vocab_choice).from_files(vocab_dir)

    model_params = config.get('model')

    # The experiment config tells us how to _train_ a model, including where to get pre-trained
    # embeddings from.  We're now _loading_ the model, so those embeddings will already be
    # stored in our weights.  We don't need any pretrained weight file anymore, and we don't
    # want the code to look for it, so we remove it from the parameters here.
    remove_pretrained_embedding_params(model_params)
    model = Model.from_params(vocab=vocab, params=model_params)

    # If vocab+embedding extension was done, the model initialized from from_params
    # and one defined by state dict in weights_file might not have same embedding shapes.
    # Eg. when model embedder module was transferred along with vocab extension, the
    # initialized embedding weight shape would be smaller than one in the state_dict.
    # So calling model embedding extension is required before load_state_dict.
    # If vocab and model embeddings are in sync, following would be just a no-op.
    model.extend_embedder_vocab()

    # model_state = torch.load(weights_file, map_location=util.device_mapping(cuda_device))
    # model.load_state_dict(model_state, strict=False)

    # List the directory once so that logging and loading see the same files.
    adapter_files = os.listdir(adapters_dir)
    for file in adapter_files:
        logger.info(f"{file} is loading..")

    # Each adapter file holds the state for every layer, so read it from disk only
    # once and reuse it across layers.
    adapter_states = {}

    # loop over the adapters folder and load weights into a dictionary
    encoder_layers = model.text_field_embedder.token_embedder_bert.bert_model.encoder.layer
    for i, layer in enumerate(encoder_layers):
        try:
            # Files are paired positionally with the layer's attention / output adapters.
            paired = zip(adapter_files, layer.attention.output.adapter, layer.output.adapter)
            for file, attention_adapter, output_attention in paired:
                if file not in adapter_states:
                    adapter_states[file] = torch.load(os.path.join(adapters_dir, file))
                adapter_state = adapter_states[file]
                attention_adapter.load_state_dict(adapter_state['attention_adapter_' + str(i)])
                output_attention.load_state_dict(adapter_state['output_adapter_' + str(i)])
        except AttributeError:
            logger.warning(f"Could not find the adapter model inside the archive {adapters_dir}")
            traceback.print_exc()
            return

    # Force model to cpu or gpu, as appropriate, to make sure that the embeddings are
    # in sync with the weights
    if cuda_device >= 0:
        model.cuda(cuda_device)
    else:
        model.cpu()

    return model
def evaluate(model: Model,
             instances: Iterable[Instance],
             data_iterator: DataIterator,
             cuda_device: int,
             serialization_dir: str,
             eval_suffix: str,
             batch_weight_key: str) -> Dict[str, Any]:
    """
    Runs ``model`` over ``instances`` in eval mode without gradients and
    returns its metrics.

    Each batch's output dict is passed to ``write_to_json`` together with a
    spaCy pipeline and an evaluation cache. When ``serialization_dir``
    contains ``'goodnews'`` or ``'nytimes'``, the cache is read from the
    matching ``data/<dataset>/evaluation_cache.pkl`` if it exists, and written
    there after evaluation if it does not. For any other directory the run
    uses an in-memory cache that is not persisted.

    Parameters
    ----------
    model : the model to evaluate.
    instances : the instances to iterate over (one epoch, no shuffling).
    data_iterator : batches ``instances``.
    cuda_device : device each batch is moved to.
    serialization_dir : output directory; must not already contain
        ``generations{eval_suffix}.jsonl``.
    eval_suffix : suffix used in the generations file name.
    batch_weight_key : key in the model output holding the batch weight;
        if falsy every batch has weight ``1.0``.

    Returns
    -------
    ``model.get_metrics(reset=True)``, with ``"loss"`` set to the weighted
    average loss when at least one batch produced a loss.
    """
    check_for_gpu(cuda_device)
    nlp = spacy.load("en_core_web_lg")
    assert not os.path.exists(
        os.path.join(serialization_dir, f'generations{eval_suffix}.jsonl'))

    # caching saves us extra 30 minutes
    if 'goodnews' in serialization_dir:
        cache_path = 'data/goodnews/evaluation_cache.pkl'
    elif 'nytimes' in serialization_dir:
        cache_path = 'data/nytimes/evaluation_cache.pkl'
    else:
        # Unknown dataset: previously ``cache_path`` was left unbound here and
        # the next line crashed. Fall back to an unpersisted in-memory cache.
        cache_path = None

    if cache_path is not None and os.path.exists(cache_path):
        # NOTE(review): pickle executes arbitrary code on load; the cache file
        # must come from a trusted source.
        with open(cache_path, 'rb') as f:
            cache = pickle.load(f)
    else:
        cache = {}

    with torch.no_grad():
        model.eval()

        iterator = data_iterator(instances, num_epochs=1, shuffle=False)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(
            iterator, total=data_iterator.get_num_batches(instances))

        # Number of batches where the model produces a loss. Batches without
        # a loss are tolerated; no check requires every batch to have one.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            write_to_json(output_dict, serialization_dir,
                          nlp, eval_suffix, cache)

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if (not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics)):
                logger.warning("Metrics with names beginning with \"_\" will "
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True

            description = ', '.join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items()
                if not name.startswith("_")
            ]) + " ||"
            generator_tqdm.set_description(description, refresh=False)

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            final_metrics["loss"] = total_loss / total_weight

    # Persist the cache only when one did not exist before this run.
    if cache_path is not None and not os.path.exists(cache_path):
        with open(cache_path, 'wb') as f:
            pickle.dump(cache, f)

    return final_metrics
# Data-loader workers: 4 per GPU on a 32-CPU machine, 2 per GPU otherwise,
# minus one.
workers_per_gpu = 4 if NUM_CPUS == 32 else 2
num_workers = workers_per_gpu * NUM_GPUS - 1
print(f"Using {num_workers} workers out of {NUM_CPUS} possible", flush=True)

# The same loader settings are shared by all three splits; the total batch
# size of 96 is divided across the GPUs.
loader_params = dict(
    batch_size=96 // NUM_GPUS,
    num_gpus=NUM_GPUS,
    num_workers=num_workers,
)
train_loader, val_loader, test_loader = (
    VCRLoader.from_dataset(split, **loader_params)
    for split in (train, val, test)
)

ARGS_RESET_EVERY = 100

model_type = params['model'].get('type', 'WTF?')
task_name = 'rationales' if args.rationale else 'answer'
print("Loading {} for {}".format(model_type, task_name), flush=True)

model = Model.from_params(vocab=train.vocab, params=params['model'])

# Freeze the detector backbone: no gradients for any of its parameters, and
# BatchNorm2d layers stop tracking running statistics.
# NOTE(review): assumes the requires_grad loop applies to every backbone
# submodule, not only BatchNorm2d layers -- confirm against the original
# indentation.
for submodule in model.detector.backbone.modules():
    if isinstance(submodule, BatchNorm2d):
        submodule.track_running_stats = False
    for p in submodule.parameters():
        p.requires_grad = False

# Wrap in DataParallel only when more than one GPU is in use.
if NUM_GPUS > 1:
    model = DataParallel(model).cuda()
else:
    model = model.cuda()

# Only parameters that still require gradients are handed to the optimizer.
trainable_named_parameters = [
    (name, param)
    for name, param in model.named_parameters()
    if param.requires_grad
]
optimizer = Optimizer.from_params(trainable_named_parameters,
                                  params['trainer']['optimizer'])

# The learning-rate scheduler is optional in the trainer config.
lr_scheduler_params = params['trainer'].pop("learning_rate_scheduler", None)
if lr_scheduler_params:
    scheduler = LearningRateScheduler.from_params(optimizer,
                                                  lr_scheduler_params)
else:
    scheduler = None
def __init__(
    self,
    id: str,
    registered_model_name: Optional[str] = None,
    model_class: Optional[type] = None,
    registered_predictor_name: Optional[str] = None,
    display_name: Optional[str] = None,
    archive_file: Optional[str] = None,
    overrides: Optional[Dict] = None,
    model_details: Optional[Union[str, ModelDetails]] = None,
    intended_use: Optional[Union[str, IntendedUse]] = None,
    factors: Optional[Union[str, Factors]] = None,
    metrics: Optional[Union[str, Metrics]] = None,
    evaluation_data: Optional[Union[str, EvaluationData]] = None,
    training_data: Optional[Union[str, TrainingData]] = None,
    quantitative_analyses: Optional[Union[str, QuantitativeAnalyses]] = None,
    ethical_considerations: Optional[Union[str, EthicalConsiderations]] = None,
    caveats_and_recommendations: Optional[Union[
        str, CaveatsAndRecommendations]] = None,
):
    """
    Build a model card, filling in defaults from the model class and
    wrapping plain-string sections in their section objects.

    ``id`` must be truthy. If ``model_class`` is not given but
    ``registered_model_name`` is, the class is looked up with
    ``Model.by_name``; an unregistered name only logs a warning. When a
    model class is available it supplies defaults for ``display_name``
    (the class name), ``model_details`` (``get_description`` of the class)
    and ``registered_predictor_name`` (the class's ``default_predictor``).

    An ``archive_file`` that does not start with ``"https:"`` is joined
    onto ``self._storage_location``.

    Any section passed as a ``str`` is converted to the corresponding
    section object; values of other types are stored unchanged.

    ``overrides`` is accepted but not used by this constructor.
    """
    assert id

    # Look the class up in the registry only when the caller did not pass one.
    if registered_model_name and not model_class:
        try:
            model_class = Model.by_name(registered_model_name)
        except ConfigurationError:
            logger.warning("{} is not a registered model.".format(
                registered_model_name))

    # Explicit arguments always win over what the model class provides.
    if model_class:
        if not display_name:
            display_name = model_class.__name__
        if not model_details:
            model_details = get_description(model_class)
        if not registered_predictor_name:
            registered_predictor_name = model_class.default_predictor  # type: ignore

    # Anything that is not an https URL is resolved against the storage root.
    if archive_file:
        if not archive_file.startswith("https:"):
            archive_file = os.path.join(self._storage_location, archive_file)

    # Promote bare strings to their section objects.
    if isinstance(model_details, str):
        model_details = ModelDetails(description=model_details)
    if isinstance(intended_use, str):
        intended_use = IntendedUse(primary_uses=intended_use)
    if isinstance(factors, str):
        factors = Factors(relevant_factors=factors)
    if isinstance(metrics, str):
        metrics = Metrics(model_performance_measures=metrics)
    if isinstance(evaluation_data, str):
        evaluation_data = EvaluationData(dataset=evaluation_data)
    if isinstance(training_data, str):
        training_data = TrainingData(dataset=training_data)
    if isinstance(quantitative_analyses, str):
        quantitative_analyses = QuantitativeAnalyses(
            unitary_results=quantitative_analyses)
    if isinstance(ethical_considerations, str):
        ethical_considerations = EthicalConsiderations(ethical_considerations)
    if isinstance(caveats_and_recommendations, str):
        caveats_and_recommendations = CaveatsAndRecommendations(
            caveats_and_recommendations)

    # Identity and registry information.
    self.id = id
    self.registered_model_name = registered_model_name
    self.registered_predictor_name = registered_predictor_name
    self.display_name = display_name
    self.archive_file = archive_file

    # Card sections.
    self.model_details = model_details
    self.intended_use = intended_use
    self.factors = factors
    self.metrics = metrics
    self.evaluation_data = evaluation_data
    self.training_data = training_data
    self.quantitative_analyses = quantitative_analyses
    self.ethical_considerations = ethical_considerations
    self.caveats_and_recommendations = caveats_and_recommendations
def test_model_load(self):
    """``Model.load`` on the decomposable-attention fixture config must
    return a ``DecomposableAttention`` instance."""
    fixture_config = 'tests/fixtures/decomposable_attention/experiment.json'
    experiment_params = Params.from_file(fixture_config)
    loaded_model = Model.load(experiment_params)
    assert isinstance(loaded_model, DecomposableAttention)
def initialize_model(self, args):
    """
    Build the model described by ``args.model`` and store it on
    ``self.model``, wrapped in ``DataParallel`` and moved to CUDA.

    The model is constructed without a vocabulary. If ``args`` has a truthy
    ``"fp16"`` entry, the model is converted with ``half()`` before wrapping.
    """
    network = Model.from_params(vocab=None, params=Params(args.model))

    use_half_precision = args.get("fp16", False)
    if use_half_precision:
        network.half()
        print("Using FP 16, Model Halfed")

    parallel_network = DataParallel(network)
    self.model = parallel_network.cuda()