def test_fine_tune_nograd_regex(self):
    original_model = load_archive(self.model_archive).model
    name_parameters_original = dict(original_model.named_parameters())
    regex_lists = [[],
                   [".*attend_feedforward.*", ".*token_embedder.*"],
                   [".*compare_feedforward.*"]]
    for regex_list in regex_lists:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = regex_list
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)
        # If a regex matches, the parameter should have requires_grad=False;
        # otherwise it should keep the same requires_grad as in the
        # originally loaded model.
        for name, parameter in tuned_model.named_parameters():
            if any(re.search(regex, name) for regex in regex_list):
                assert not parameter.requires_grad
            else:
                assert parameter.requires_grad \
                    == name_parameters_original[name].requires_grad
    # If all parameters have requires_grad=False, then error.
    with pytest.raises(Exception) as _:
        params = Params.from_file(self.config_file)
        params["trainer"]["no_grad"] = ["*"]
        shutil.rmtree(self.serialization_dir, ignore_errors=True)
        tuned_model = fine_tune_model(model=original_model,
                                      params=params,
                                      serialization_dir=self.serialization_dir)
def fine_tune_model_from_file_paths(model_archive_path: str,
                                    config_file: str,
                                    serialization_dir: str,
                                    overrides: str = "",
                                    file_friendly_logging: bool = False) -> Model:
    """
    A wrapper around :func:`fine_tune_model` which loads the model archive from a file.

    Parameters
    ----------
    model_archive_path : ``str``
        Path to a saved model archive that is the result of running the ``train`` command.
    config_file : ``str``
        A configuration file specifying how to continue training.  The format is identical to the
        configuration file for the ``train`` command, but any contents in the ``model`` section
        are ignored (as we are using the provided model archive instead).
    serialization_dir : ``str``
        The directory in which to save results and logs.  We just pass this along to
        :func:`fine_tune_model`.
    overrides : ``str``
        A JSON string that we will use to override values in the input parameter file.
    file_friendly_logging : ``bool``, optional (default=False)
        If ``True``, we make our output more friendly to saved model files.  We just pass this
        along to :func:`fine_tune_model`.
    """
    # We don't need to pass in `cuda_device` here, because the trainer will call `model.cuda()` if
    # necessary.
    archive = load_archive(model_archive_path)
    params = Params.from_file(config_file, overrides)
    return fine_tune_model(model=archive.model,
                           params=params,
                           serialization_dir=serialization_dir,
                           file_friendly_logging=file_friendly_logging)
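# A minimal usage sketch for the wrapper above.  The archive path, config path,
# and overrides string are made-up placeholders, not files from this repository;
# the call simply chains load_archive -> Params.from_file -> fine_tune_model.
finetuned = fine_tune_model_from_file_paths(
    model_archive_path="pretrained/model.tar.gz",       # hypothetical archive
    config_file="configs/fine_tune.jsonnet",             # hypothetical config
    serialization_dir="output/fine_tune_run",
    overrides='{"trainer": {"num_epochs": 3}}',
)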
def test_fine_tune_runtime_errors_with_vocab_expansion(self):
    params = Params.from_file(self.config_file)
    params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')
    model = load_archive(self.model_archive).model
    # If we do vocab expansion, we get a runtime error because of the embedding.
    with pytest.raises(RuntimeError):
        fine_tune_model(model, params, self.serialization_dir, extend_vocab=True)
def test_fine_tune_does_not_expand_vocab_by_default(self):
    params = Params.from_file(self.config_file)
    # snli2 has a new token in it
    params["train_data_path"] = str(self.FIXTURES_ROOT / 'data' / 'snli2.jsonl')
    model = load_archive(self.model_archive).model
    # By default, no vocab expansion.
    fine_tune_model(model, params, self.serialization_dir)
def ensure_model_can_train_save_and_load(
    self,
    param_file: Union[PathLike, str],
    tolerance: float = 1e-4,
    cuda_device: int = -1,
    gradients_to_ignore: Set[str] = None,
    overrides: str = "",
    metric_to_check: str = None,
    metric_terminal_value: float = None,
    metric_tolerance: float = 1e-4,
    disable_dropout: bool = True,
):
    """
    # Parameters

    param_file : `str`
        Path to a training configuration file that we will use to train the model for this
        test.
    tolerance : `float`, optional (default=`1e-4`)
        When comparing model predictions between the originally-trained model and the model
        after saving and loading, we will use this tolerance value (passed as `rtol` to
        `numpy.testing.assert_allclose`).
    cuda_device : `int`, optional (default=`-1`)
        The device to run the test on.
    gradients_to_ignore : `Set[str]`, optional (default=`None`)
        This test runs a gradient check to make sure that we're actually computing gradients
        for all of the parameters in the model.  If you really want to ignore certain
        parameters when doing that check, you can pass their names here.  This is not
        recommended unless you're `really` sure you don't need to have non-zero gradients for
        those parameters (e.g., some of the beam search / state machine models have
        infrequently-used parameters that are hard to force the model to use in a small test).
    overrides : `str`, optional (default = `""`)
        A JSON string that we will use to override values in the input parameter file.
    metric_to_check : `str`, optional (default = `None`)
        We may want to automatically perform a check that the model reaches a given metric when
        training (on the validation set, if it is specified).  It may be useful in CI, for
        example.  You can pass any metric that is in your model's returned metrics.
    metric_terminal_value : `float`, optional (default = `None`)
        When you set `metric_to_check`, you need to set the value this metric must converge to.
    metric_tolerance : `float`, optional (default=`1e-4`)
        Tolerance to check your model metric against the metric terminal value.  One can expect
        some variance in model metrics when the training process is highly stochastic.
    disable_dropout : `bool`, optional (default = `True`)
        If True we will set all dropout to 0 before checking gradients.  (Otherwise, with small
        datasets, you may get zero gradients because of unlucky dropout.)
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)

    metrics_file = save_dir / "metrics.json"
    if metric_to_check is not None:
        metrics = json.loads(metrics_file.read_text())
        metric_value = metrics.get(f"best_validation_{metric_to_check}") or metrics.get(
            f"training_{metric_to_check}")
        assert metric_value is not None, f"Cannot find {metric_to_check} in metrics.json file"
        assert metric_terminal_value is not None, "Please specify metric terminal value"
        assert abs(metric_value - metric_terminal_value) < metric_tolerance

    archive = load_archive(archive_file, cuda_device=cuda_device)
    loaded_model = archive.model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys

    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(
            model.state_dict()[key].cpu().numpy(),
            loaded_model.state_dict()[key].cpu().numpy(),
            err_msg=key,
        )

    reader = archive.dataset_reader
    params = Params.from_file(param_file, params_overrides=overrides)

    print("Reading with original model")
    model_dataset = reader.read(params["validation_data_path"])
    model_dataset.index_with(model.vocab)

    print("Reading with loaded model")
    loaded_dataset = reader.read(params["validation_data_path"])
    loaded_dataset.index_with(loaded_model.vocab)

    # Need to duplicate params because DataLoader.from_params will consume.
    data_loader_params = params["data_loader"]
    data_loader_params["shuffle"] = False
    data_loader_params2 = Params(copy.deepcopy(data_loader_params.as_dict()))

    data_loader = DataLoader.from_params(dataset=model_dataset, params=data_loader_params)
    data_loader2 = DataLoader.from_params(dataset=loaded_dataset, params=data_loader_params2)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    model_batch = next(iter(data_loader))
    loaded_batch = next(iter(data_loader2))

    # Check gradients are None for non-trainable parameters and check that
    # trainable parameters receive some gradient if they are trainable.
    self.check_model_computes_gradients_correctly(
        model, model_batch, gradients_to_ignore, disable_dropout)

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()

    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, "stateful") and module.stateful:
                module.reset_states()

    print("Predicting with original model")
    model_predictions = model(**model_batch)
    print("Predicting with loaded model")
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(
            model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model.train()
    loaded_model_predictions = loaded_model(**loaded_batch)
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    return model, loaded_model
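# A sketch of how this helper is typically exercised from a model test.
# It assumes an AllenNLP-style ModelTestCase subclass; the fixture paths
# below are made-up placeholders, not real files in this repository.
from allennlp.common.testing import ModelTestCase


class MyModelTest(ModelTestCase):
    def setup_method(self):
        super().setup_method()
        self.set_up_model("fixtures/my_model/experiment.json",   # hypothetical config
                          "fixtures/my_model/data.jsonl")        # hypothetical dataset

    def test_model_can_train_save_and_load(self):
        # Trains on the fixture, archives, reloads, and compares predictions.
        self.ensure_model_can_train_save_and_load(self.param_file)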
def test_train_model_distributed_with_gradient_accumulation(self, max_instances, grad_acc, batch_size):
    if torch.cuda.device_count() >= 2:
        devices = [0, 1]
    else:
        devices = [-1, -1]

    params = lambda: Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        },
        "dataset_reader": {"type": "sequence_tagging", "max_instances": max_instances},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "data_loader": {"batch_size": batch_size},
        "trainer": {
            "num_epochs": 2,
            "optimizer": "adam",
            "num_gradient_accumulation_steps": grad_acc,
        },
        "distributed": {"cuda_devices": devices},
    })

    out_dir = os.path.join(self.TEST_DIR, "test_distributed_train_with_grad_acc")
    train_model(params(), serialization_dir=out_dir)

    # Check that some logs specific to distributed training are where we expect.
    serialized_files = os.listdir(out_dir)
    assert "out_worker0.log" in serialized_files
    assert "out_worker1.log" in serialized_files
    assert "model.tar.gz" in serialized_files
    assert "metrics.json" in serialized_files

    # Make sure the metrics look right.
    with open(os.path.join(out_dir, "metrics.json")) as f:
        metrics = json.load(f)
        assert metrics["peak_worker_0_memory_MB"] > 0
        assert metrics["peak_worker_1_memory_MB"] > 0
        if torch.cuda.device_count() >= 2:
            assert metrics["peak_gpu_0_memory_MB"] > 0
            assert metrics["peak_gpu_1_memory_MB"] > 0

    # Check we can load the serialized model
    assert load_archive(out_dir).model
def main():
    # Load SNLI dataset
    bert_indexer = PretrainedTransformerIndexer('bert-base-uncased')
    tokenizer = PretrainedTransformerTokenizer(model_name='bert-base-uncased')
    reader = SnliReader(token_indexers={'tokens': bert_indexer},
                        tokenizer=tokenizer,
                        combine_input_fields=True)
    # single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    # tokenizer = WordTokenizer(end_tokens=["@@NULL@@"])  # add @@NULL@@ to the end of sentences
    # reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)

    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl'
    )

    # Load model and vocab
    model_type = "pred"
    # model_type = "merged"
    if model_type == "merged":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/merged_model.tar.gz'
        ).model
    elif model_type == "pred":
        model = load_archive(
            '/home/junliw/gradient-regularization/SNLI/archives/bert_models/bert_trained2.tar.gz'
        ).model
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. the input tokens
    utils.add_hooks(model)

    # save the word embedding matrix
    if model_type == "merged":
        embedding_weight = model.combined_model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight
    else:
        embedding_weight = model._text_field_embedder._modules[
            "token_embedder_tokens"].transformer_model.embeddings.word_embeddings.weight
    # print(model.combined_model._text_field_embedder._modules["token_embedder_tokens"].transformer_model.embeddings.word_embeddings)
    # print(embedding_weight.size())

    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32
    # iterator = DataIterator(batch_size=universal_perturb_batch_size)
    # iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = 'entailment'  # only entailment examples
    # dataset_label_filter = 'contradiction'  # only contradiction examples
    # dataset_label_filter = 'neutral'  # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance['label'].label == dataset_label_filter:
            subset_dev_dataset.append(instance)
    print(len(subset_dev_dataset))
    print(len(dev_dataset))

    # the attack is targeted towards a specific class
    # target_label = "0"  # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2"  # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    # tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(model, subset_dev_dataset, vocab, tokenizer, model_type,
                       trigger_token_ids=None, snli=True)
    model.train()  # the model cannot do backwards in eval mode, so switch back to train

    # Initialize triggers: two trigger tokens prepended to each example
    num_trigger_tokens = 2
    start_tok = tokenizer.tokenizer.encode("a")[1]
    print(start_tok)
    trigger_token_ids = [start_tok] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    subset_dev_dataset_dataset = AllennlpDataset(dev_dataset, vocab)
    train_sampler = BucketBatchSampler(subset_dev_dataset_dataset,
                                       batch_size=universal_perturb_batch_size,
                                       sorting_keys=["tokens"])
    train_dataloader = DataLoader(subset_dev_dataset_dataset, batch_sampler=train_sampler)
    # for batch in lazy_groups_of(iterators(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
    for batch in train_dataloader:
        # get model accuracy with current triggers
        utils.get_accuracy(model, subset_dev_dataset, vocab, tokenizer, model_type,
                           trigger_token_ids, snli=True)
        model.train()  # the model cannot do backwards in eval mode, so switch back to train

        # get grad of triggers
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids,
                                               target_label, snli=True)

        # find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        increase_loss=False,
                                                        num_candidates=40)
        print("------")
        print(cand_trigger_token_ids)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model, batch, trigger_token_ids,
                                                      cand_trigger_token_ids, snli=True)
def build_specter_vectors(hf_dataset: str,
                          specter_path: str,
                          output_path: str,
                          cuda_device: int = -1,
                          batch_size: int = 32,
                          vector_size: int = 768,
                          override=False):
    """
    Run with:
    $ ./data_cli.py build_specter_vectors paperswithcode_aspects ./specter_archive ./output/pwc_doc_id2specter.w2v.txt --cuda_device=5

    Download specter:
    $ wget https://ai2-s2-research-public.s3-us-west-2.amazonaws.com/specter/archive.tar.gz
    $ tar -xzvf archive.tar.gz

    :param vector_size:
    :param output_path: ./output
    :param override:
    :param cuda_device:
    :param batch_size:
    :param hf_dataset:
    :param specter_path: Path to specter
    :return:
    """
    from specter.predict_command import predictor_from_archive
    from allennlp.models import load_archive

    # load to register
    from specter.model import Model
    from specter.data import DataReader, DataReaderFromPickled
    from specter.predictor import SpecterPredictor

    if Model and DataReader and SpecterPredictor:
        pass

    if os.path.exists(output_path) and not override:
        logger.error(f'Output file exists already: {output_path}')
        return

    # Dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir='./data/nlp_cache',
                           split='docs')
    logger.info(f'Documents loaded: {len(docs_ds):,}')
    papers_to_embed = [doc for doc in docs_ds]

    # Specter settings
    archive_path = os.path.join(specter_path, 'model.tar.gz')
    metadata_path = os.path.join(specter_path, 'metadata_sample.json')
    included_text_fields = 'abstract title'
    vocab_dir = os.path.join(specter_path, 'data/vocab/')

    cuda_device = int(cuda_device)

    overrides = f"{{'model':{{'predict_mode':'true','include_venue':'false'}},'dataset_reader':{{'type':'specter_data_reader','predict_mode':'true','paper_features_path':'{metadata_path}','included_text_fields': '{included_text_fields}'}},'vocabulary':{{'directory_path':'{vocab_dir}'}}}}"
    logger.info(f'SPECTER overrides: {overrides}')

    archive = load_archive(archive_path, cuda_device=cuda_device, overrides=overrides)
    predictor = predictor_from_archive(archive,
                                       predictor_name='specter_predictor',
                                       paper_features_path=metadata_path)

    # Batches
    def chunks(lst, chunk_size):
        """Splits a longer list to respect batch size"""
        for i in range(0, len(lst), chunk_size):
            yield lst[i:i + chunk_size]

    batches_count = int(len(papers_to_embed) / batch_size)
    batch_embed_papers = []

    # 30min on GPU
    for batch in tqdm(chunks(papers_to_embed, batch_size), total=batches_count):
        batch_out = predictor.predict_batch_json(batch)
        batch_embed_papers += batch_out

    # To keyed vectors
    doc_model = KeyedVectors(vector_size=vector_size)
    for embed_paper in tqdm(batch_embed_papers):
        doc_model.add([embed_paper['paper_id']], [embed_paper['embedding']])

    # Save to disk
    doc_model.save_word2vec_format(output_path)

    logger.info('Done')
parser.add_argument('--db', type=str, help='/path/to/saved/db.db')
parser.add_argument('--drqa-model', type=str, help='/path/to/saved/drqa/model')
parser.add_argument('--rte-model', type=str, help='/path/to/saved/rte/model')
parser.add_argument('--max-page', type=int, default=5)
parser.add_argument('--max-sent', type=int, default=5)
parser.add_argument("--cuda-device", type=int, default=-1, help='id of GPU to use (if any)')
args = parser.parse_args()

logger.info("Load DB")
db = FeverDocDB(args.db)

logger.info("Load RTE-Model")
archive = load_archive(args.rte_model, cuda_device=args.cuda_device)

logger.info("Init Retriever")
evidence_retriever = EvidenceRetrieval(db, args.drqa_model, args.max_page, args.max_sent)

config = archive.config
ds_params = config["dataset_reader"]

model = archive.model
model.eval()

reader = FEVERReader(
    db,
    sentence_level=ds_params.pop("sentence_level", False),
    wiki_tokenizer=Tokenizer.from_params(
        ds_params.pop('wiki_tokenizer',
def load(self, path: str):
    self.predictor = Predictor.from_archive(
        load_archive(path, cuda_device=self.cuda), self.task)
        mentions += random.choices(mentions, k=sample_size - len(mentions))
        return mentions

    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        mentions = json_dict['mentions']
        assert len(mentions) == self._dataset_reader.sentence_sample
        instance = self._dataset_reader.text_to_instance(sentences=mentions)
        return instance


if __name__ == '__main__':
    model_path = sys.argv[1]
    archive = load_archive(model_path, overrides=PersonPredictor.overrides)
    predictor = Predictor.from_archive(archive, 'person-predictor')

    result = predictor.predict_json({
        "mentions": PersonPredictor.select_mentions([
            "@@mb@@ Perelman @@me@@ is Russian writer",
            "Millennium Prize Problem was solved by @@mb@@ him @@me@@ in 1998 and then he died",
        ], predictor._dataset_reader.sentence_sample)
    })

    labels = archive.model.vocab.get_index_to_token_vocabulary("labels")
    predicted_labels = dict((labels[idx], prob)
                            for idx, prob in enumerate(result['predictions']))
def __init__(self,
             vocab: Vocabulary,
             token_representation_dim: int,
             encoder: Optional[Seq2SeqEncoder] = None,
             decoder: Optional[Union[FeedForward, str]] = None,
             use_crf: bool = False,
             constrain_crf_decoding: bool = False,
             include_start_end_transitions: bool = True,
             label_encoding: Optional[str] = None,
             contextualizer: Optional[Contextualizer] = None,
             calculate_per_label_f1: bool = False,
             calculate_span_f1: bool = False,
             calculate_perplexity: bool = False,
             loss_average: str = "batch",
             pretrained_file: Optional[str] = None,
             transfer_contextualizer_from_pretrained_file: bool = False,
             transfer_encoder_from_pretrained_file: bool = False,
             freeze_encoder: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(Tagger, self).__init__(vocab, regularizer)

    self._num_classes = self.vocab.get_vocab_size("labels")
    self._token_representation_dim = token_representation_dim
    self._contextualizer = contextualizer
    if encoder is None:
        encoder = PassThroughEncoder(input_dim=token_representation_dim)
    self._encoder = encoder

    # Load the contextualizer and encoder weights from the
    # pretrained_file if applicable
    if pretrained_file:
        archive = None
        if self._contextualizer and transfer_contextualizer_from_pretrained_file:
            logger.info("Attempting to load contextualizer weights from "
                        "pretrained_file at {}".format(pretrained_file))
            archive = load_archive(cached_path(pretrained_file))
            contextualizer_state = archive.model._contextualizer.state_dict()
            contextualizer_layer_num = self._contextualizer._layer_num
            logger.info("contextualizer_layer_num {}".format(contextualizer_layer_num))
            self._contextualizer.load_state_dict(contextualizer_state)
            if contextualizer_layer_num is not None:
                logger.info("Setting layer num to {}".format(contextualizer_layer_num))
                self._contextualizer.set_layer_num(contextualizer_layer_num)
            else:
                self._contextualizer.reset_layer_num()
            logger.info("Successfully loaded contextualizer weights!")
        if transfer_encoder_from_pretrained_file:
            logger.info("Attempting to load encoder weights from "
                        "pretrained_file at {}".format(pretrained_file))
            if archive is None:
                archive = load_archive(cached_path(pretrained_file))
            encoder_state = archive.model._encoder.state_dict()
            self._encoder.load_state_dict(encoder_state)
            logger.info("Successfully loaded encoder weights!")

    self._freeze_encoder = freeze_encoder
    for parameter in self._encoder.parameters():
        # If freeze is true, requires_grad should be false and vice versa.
        parameter.requires_grad_(not self._freeze_encoder)

    if decoder is None or decoder == "linear":
        # Create the default decoder (logistic regression) if it is not provided.
        decoder = FeedForward.from_params(Params(
            {"input_dim": self._encoder.get_output_dim(),
             "num_layers": 1,
             "hidden_dims": self._num_classes,
             "activations": "linear"}))
        logger.info("No decoder provided to model, using default "
                    "decoder: {}".format(decoder))
    elif decoder == "mlp":
        # Create the MLP decoder
        decoder = FeedForward.from_params(Params(
            {"input_dim": self._encoder.get_output_dim(),
             "num_layers": 2,
             "hidden_dims": [1024, self._num_classes],
             "activations": ["relu", "linear"]}))
        logger.info("Using MLP decoder: {}".format(decoder))
    self._decoder = TimeDistributed(decoder)

    self._use_crf = use_crf
    self._constrain_crf_decoding = constrain_crf_decoding
    self._crf = None
    if use_crf:
        logger.info("Using CRF on top of decoder outputs")
        if constrain_crf_decoding:
            if label_encoding is None:
                raise ConfigurationError(
                    "constrain_crf_decoding is True, but "
                    "label_encoding was not provided. label_encoding "
                    "must be provided.")
            logger.info("Constraining CRF decoding with label "
                        "encoding {}".format(label_encoding))
            labels = self.vocab.get_index_to_token_vocabulary("labels")
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None
        self._crf = ConditionalRandomField(
            self._num_classes, constraints,
            include_start_end_transitions=include_start_end_transitions)

    check_dimensions_match(self._token_representation_dim, self._encoder.get_input_dim(),
                           "dimensionality of token representation", "encoder input dim")
    check_dimensions_match(self._encoder.get_output_dim(), self._decoder._module.get_input_dim(),
                           "encoder output dim", "decoder input dim")
    check_dimensions_match(self._decoder._module.get_output_dim(), self._num_classes,
                           "decoder output dim", "number of classes")
    if loss_average not in {"batch", "token"}:
        raise ConfigurationError("loss_average is {}, expected one of batch "
                                 "or token".format(loss_average))
    self.loss_average = loss_average

    self.metrics = {
        "accuracy": CategoricalAccuracy(),
        "accuracy3": CategoricalAccuracy(top_k=3)
    }

    self.calculate_perplexity = calculate_perplexity
    if calculate_perplexity:
        self.metrics["perplexity"] = Perplexity()

    self.calculate_per_label_f1 = calculate_per_label_f1
    self.calculate_span_f1 = calculate_span_f1
    if label_encoding and label_encoding not in ["BIO", "BIOUL", "IOB1"]:
        raise ConfigurationError("If not None, label encoding must be one of BIO, BIOUL, "
                                 "or IOB1. Got {}".format(label_encoding))
    self.label_encoding = label_encoding

    label_metric_name = "label_{}" if self.calculate_per_label_f1 else "_label_{}"
    for label_name, label_index in self.vocab._token_to_index["labels"].items():
        self.metrics[label_metric_name.format(label_name)] = F1Measure(positive_label=label_index)

    if self.calculate_span_f1:
        if not self.label_encoding:
            raise ConfigurationError("label_encoding must be provided when "
                                     "calculating_span_f1 is true.")
        else:
            # Set up span-based F1 measure
            self.metrics["span_based_f1"] = SpanBasedF1Measure(
                self.vocab, tag_namespace="labels", label_encoding=self.label_encoding)

    # Whether to run in error analysis mode or not, see commands.error_analysis
    self.error_analysis = False
    logger.info("Applying initializer...")
    initializer(self)
def from_params(cls, vocab: Vocabulary, params: Params) -> 'ESIMPtrExtractor':
    entailment_params = params.pop("entailment_esim")
    fix_entailment_params = params.pop('fix_entailment_params', False)
    if 'archive_file' in entailment_params:
        model = load_archive(entailment_params.pop('archive_file')).model
        if model._combine_feedforward is not None:
            model._entailment_esim._combine_feedforward = model._combine_feedforward
        if model._aggregate_feedforward is not None:
            model._entailment_esim._aggregate_feedforward = model._aggregate_feedforward
        entailment_esim = model._entailment_esim
        fix_entailment_params = entailment_params.pop('fix_entailment_params', True)
        if fix_entailment_params:
            for parameter in entailment_esim.parameters():
                parameter.requires_grad = False
    elif entailment_params.pop('model', None) == 'feature_model':
        weights_file = entailment_params.pop('weights_file', None)
        entailment_esim = FeatureModel(**entailment_params)
        if weights_file is not None:
            entailment_esim.load_state_dict(torch.load(weights_file))
    else:
        entailment_esim = ESIM.from_params(vocab, entailment_params)

    sentence_selection_params = params.pop("sentence_esim")
    pretrained_ptr_extractor = None
    fix_sentence_selection_esim_params = False
    if 'archive_file' in sentence_selection_params:
        archive_file = sentence_selection_params.pop('archive_file')
        pretrained_ptr_extractor = load_archive(archive_file).model
        sentence_selection_esim = pretrained_ptr_extractor._entailment_esim
        fix_sentence_selection_esim_params = sentence_selection_params.pop(
            'fix_sentence_selection_esim_params', False)
        if fix_sentence_selection_esim_params:
            for parameter in sentence_selection_esim.parameters():
                parameter.requires_grad = False
    elif sentence_selection_params.pop('model', None) == 'feature_model':
        sentence_selection_esim = FeatureModel(**sentence_selection_params)
    else:
        sentence_selection_esim = ESIM.from_params(
            vocab, sentence_selection_params,
            vocab_weight=entailment_esim._text_field_embedder.token_embedder_tokens.weight.data)

    ptr_extract_summ_params = params.pop('ptr_extract_summ')
    fix_ptr_extract_summ_params = False
    if 'archive_file' in ptr_extract_summ_params:
        archive_file = ptr_extract_summ_params.pop('archive_file')
        if pretrained_ptr_extractor is None:
            pretrained_ptr_extractor = load_archive(archive_file).model
        ptr_extract_summ_params['pretrained'] = pretrained_ptr_extractor._ptr_extract_summ
        fix_ptr_extract_summ_params = ptr_extract_summ_params.pop(
            'fix_ptr_extract_summ_params', False)
        if fix_ptr_extract_summ_params:
            for parameter in ptr_extract_summ_params['pretrained'].parameters():
                parameter.requires_grad = False
    ptr_extract_summ = ActorCritic(**ptr_extract_summ_params)

    initializer = InitializerApplicator.from_params(params.pop('initializer', []))
    regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

    ei_reward_weight = params.pop("ei_reward_weight", 1)
    nei_label = params.pop("nei_label", 0)
    train_gold_evidence = params.pop("train_gold_evidence", False)
    use_decoder_states = params.pop("use_decoder_states", False)
    beam_size = params.pop("beam_size", 5)
    fix_sentence_extraction_params = params.pop("fix_sentence_extraction_params", False)
    params.assert_empty(cls.__name__)

    return cls(vocab=vocab,
               sentence_selection_esim=sentence_selection_esim,
               entailment_esim=entailment_esim,
               ptr_extract_summ=ptr_extract_summ,
               initializer=initializer,
               regularizer=regularizer,
               ei_reward_weight=ei_reward_weight,
               fix_entailment_params=fix_entailment_params,
               fix_sentence_extraction_params=fix_sentence_extraction_params
               or fix_ptr_extract_summ_params and fix_sentence_selection_esim_params,
               nei_label=nei_label,
               train_gold_evidence=train_gold_evidence,
               use_decoder_states=use_decoder_states,
               beam_size=beam_size)
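# Judging from the keys popped in from_params above, the Params it consumes would
# look roughly like the sketch below.  This is only an illustration: every archive
# path and value is a made-up placeholder, and sub-model hyperparameters are omitted.
from allennlp.common import Params

example_params = Params({
    "entailment_esim": {"archive_file": "archives/entailment_esim.tar.gz"},      # placeholder
    "sentence_esim": {
        "archive_file": "archives/sentence_esim.tar.gz",                         # placeholder
        "fix_sentence_selection_esim_params": True,
    },
    "ptr_extract_summ": {"archive_file": "archives/ptr_extractor.tar.gz"},       # placeholder
    "initializer": [],
    "regularizer": [],
    "ei_reward_weight": 1,
    "nei_label": 0,
    "train_gold_evidence": False,
    "use_decoder_states": False,
    "beam_size": 5,
    "fix_sentence_extraction_params": False,
})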
single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
# use_subtrees gives us a bit of extra data by breaking down each example into sub sentences.
tokenizer = WordTokenizer(end_tokens=["@@NULL@@"])  # add @@NULL@@ to the end of sentences
reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)
dev_dataset = reader.read(
    'https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl'
)

# Load model and vocab
model = load_archive(
    'https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz'
).model
model.train().cuda()
snli_vocab = model.vocab

mask_word_ARAE = []
ARAE_words = list(ARAE_word2idx.keys())
for word in ARAE_words:
    if snli_vocab.get_token_index(word) == 1:
        mask_word_ARAE.append(ARAE_word2idx[word])
mask_word_ARAE = np.array(list(set(mask_word_ARAE)))
mask_ARAE_logits = np.zeros((1, 1, len(ARAE_words)))
mask_ARAE_logits[:, :, mask_word_ARAE] = -float("Inf")
def ensure_model_can_train_save_and_load(self, param_file: str):
    save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
    archive_file = os.path.join(save_dir, "model.tar.gz")
    model = train_model_from_file(param_file, save_dir)
    loaded_model = load_archive(archive_file).model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys
    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(model.state_dict()[key].numpy(),
                        loaded_model.state_dict()[key].numpy(),
                        err_msg=key)

    params = Params.from_file(self.param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])
    iterator = DataIterator.from_params(params['iterator'])

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    model_dataset = reader.read(params['validation_data_path'])
    model_dataset.index_instances(model.vocab)
    model_batch_arrays = next(iterator(model_dataset, shuffle=False))
    model_batch = arrays_to_variables(model_batch_arrays, for_training=False)

    loaded_dataset = reader.read(params['validation_data_path'])
    loaded_dataset.index_instances(loaded_model.vocab)
    loaded_batch_arrays = next(iterator(loaded_dataset, shuffle=False))
    loaded_batch = arrays_to_variables(loaded_batch_arrays, for_training=False)

    # The datasets themselves should be identical.
    for key in model_batch.keys():
        field = model_batch[key]
        if isinstance(field, dict):
            for subfield in field:
                self.assert_fields_equal(model_batch[key][subfield],
                                         loaded_batch[key][subfield],
                                         tolerance=1e-6,
                                         name=key + '.' + subfield)
        else:
            self.assert_fields_equal(model_batch[key], loaded_batch[key], 1e-6, key)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()
    model_predictions = model.forward(**model_batch)
    loaded_model_predictions = loaded_model.forward(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(model_predictions[key],
                                 loaded_model_predictions[key],
                                 tolerance=1e-4,
                                 name=key)

    return model, loaded_model
def ensure_model_can_train_save_and_load(
    self,
    param_file: str,
    tolerance: float = 1e-4,
    cuda_device: int = -1,
    gradients_to_ignore: Set[str] = None,
    overrides: str = "",
    disable_dropout: bool = True,
):
    """
    # Parameters

    param_file : ``str``
        Path to a training configuration file that we will use to train the model for this
        test.
    tolerance : ``float``, optional (default=1e-4)
        When comparing model predictions between the originally-trained model and the model
        after saving and loading, we will use this tolerance value (passed as ``rtol`` to
        ``numpy.testing.assert_allclose``).
    cuda_device : ``int``, optional (default=-1)
        The device to run the test on.
    gradients_to_ignore : ``Set[str]``, optional (default=None)
        This test runs a gradient check to make sure that we're actually computing gradients
        for all of the parameters in the model.  If you really want to ignore certain
        parameters when doing that check, you can pass their names here.  This is not
        recommended unless you're `really` sure you don't need to have non-zero gradients for
        those parameters (e.g., some of the beam search / state machine models have
        infrequently-used parameters that are hard to force the model to use in a small test).
    overrides : ``str``, optional (default = "")
        A JSON string that we will use to override values in the input parameter file.
    disable_dropout : ``bool``, optional (default = True)
        If True we will set all dropout to 0 before checking gradients.  (Otherwise, with small
        datasets, you may get zero gradients because of unlucky dropout.)
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys
    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(
            model.state_dict()[key].cpu().numpy(),
            loaded_model.state_dict()[key].cpu().numpy(),
            err_msg=key,
        )
    params = Params.from_file(param_file, params_overrides=overrides)
    reader = DatasetReader.from_params(params["dataset_reader"])

    # Need to duplicate params because Iterator.from_params will consume.
    iterator_params = params["iterator"]
    iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

    iterator = DataIterator.from_params(iterator_params)
    iterator2 = DataIterator.from_params(iterator_params2)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    model_dataset = reader.read(params["validation_data_path"])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(model_dataset, shuffle=False))

    loaded_dataset = reader.read(params["validation_data_path"])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

    # Check gradients are None for non-trainable parameters and check that
    # trainable parameters receive some gradient if they are trainable.
    self.check_model_computes_gradients_correctly(
        model, model_batch, gradients_to_ignore, disable_dropout
    )

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()

    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, "stateful") and module.stateful:
                module.reset_states()
    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(
            model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance
        )

    return model, loaded_model
def test_train_model_distributed_without_sharded_reader(self, lazy: bool):
    if torch.cuda.device_count() >= 2:
        devices = [0, 1]
    else:
        devices = [-1, -1]

    num_epochs = 2
    params = lambda: Params({
        "model": {
            "type": "simple_tagger",
            "text_field_embedder": {
                "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 5}}
            },
            "encoder": {"type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2},
        },
        "dataset_reader": {"type": "sequence_tagging", "lazy": lazy},
        "train_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "validation_data_path": SEQUENCE_TAGGING_DATA_PATH,
        "data_loader": {"batch_size": 1},
        "trainer": {
            "num_epochs": num_epochs,
            "optimizer": "adam",
            "batch_callbacks": ["tests.commands.train_test.TrainingDataLoggerBatchCallback"],
        },
        "distributed": {"cuda_devices": devices},
    })

    out_dir = os.path.join(self.TEST_DIR, "test_distributed_train")
    train_model(params(), serialization_dir=out_dir)

    # Check that some logs specific to distributed training are where we expect.
    serialized_files = os.listdir(out_dir)
    assert "stderr_worker0.log" in serialized_files
    assert "stdout_worker0.log" in serialized_files
    assert "stderr_worker1.log" in serialized_files
    assert "stdout_worker1.log" in serialized_files
    assert "model.tar.gz" in serialized_files

    # Check we can load the serialized model
    archive = load_archive(out_dir)
    assert archive.model

    # Check that we created a vocab from all the shards.
    tokens = set(archive.model.vocab._token_to_index["tokens"].keys())
    assert tokens == {
        "@@PADDING@@",
        "@@UNKNOWN@@",
        "are",
        ".",
        "animals",
        "cats",
        "dogs",
        "snakes",
        "birds",
    }

    train_complete = "completed its entire epoch (training)."
    validation_complete = "completed its entire epoch (validation)."

    import re

    pattern = re.compile(r"First word from training data: '([^']*)'")
    first_word_counts = Counter()
    with open(os.path.join(out_dir, "stdout_worker0.log")) as f:
        worker0_log = f.read()
        assert train_complete in worker0_log
        assert validation_complete in worker0_log
        for first_word in pattern.findall(worker0_log):
            first_word_counts[first_word] += 1

    with open(os.path.join(out_dir, "stdout_worker1.log")) as f:
        worker1_log = f.read()
        assert train_complete in worker1_log
        assert validation_complete in worker1_log
        for first_word in pattern.findall(worker1_log):
            first_word_counts[first_word] += 1

    assert first_word_counts == {
        "cats": num_epochs,
        "dogs": num_epochs,
        "snakes": num_epochs,
        "birds": num_epochs,
    }
def ensure_model_can_train_save_and_load(self,
                                         param_file: str,
                                         tolerance: float = 1e-4,
                                         cuda_device: int = -1,
                                         gradients_to_ignore: Set[str] = None,
                                         overrides: str = ""):
    """
    Parameters
    ----------
    param_file : ``str``
        Path to a training configuration file that we will use to train the model for this
        test.
    tolerance : ``float``, optional (default=1e-4)
        When comparing model predictions between the originally-trained model and the model
        after saving and loading, we will use this tolerance value (passed as ``rtol`` to
        ``numpy.testing.assert_allclose``).
    cuda_device : ``int``, optional (default=-1)
        The device to run the test on.
    gradients_to_ignore : ``Set[str]``, optional (default=None)
        This test runs a gradient check to make sure that we're actually computing gradients
        for all of the parameters in the model.  If you really want to ignore certain
        parameters when doing that check, you can pass their names here.  This is not
        recommended unless you're `really` sure you don't need to have non-zero gradients for
        those parameters (e.g., some of the beam search / state machine models have
        infrequently-used parameters that are hard to force the model to use in a small test).
    overrides : ``str``, optional (default = "")
        A JSON string that we will use to override values in the input parameter file.
    """
    save_dir = self.TEST_DIR / "save_and_load_test"
    archive_file = save_dir / "model.tar.gz"
    model = train_model_from_file(param_file, save_dir, overrides=overrides)
    loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
    state_keys = model.state_dict().keys()
    loaded_state_keys = loaded_model.state_dict().keys()
    assert state_keys == loaded_state_keys
    # First we make sure that the state dict (the parameters) are the same for both models.
    for key in state_keys:
        assert_allclose(model.state_dict()[key].cpu().numpy(),
                        loaded_model.state_dict()[key].cpu().numpy(),
                        err_msg=key)
    params = Params.from_file(param_file)
    reader = DatasetReader.from_params(params['dataset_reader'])

    # Need to duplicate params because Iterator.from_params will consume.
    iterator_params = params['iterator']
    iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

    iterator = DataIterator.from_params(iterator_params)
    iterator2 = DataIterator.from_params(iterator_params2)

    # We'll check that even if we index the dataset with each model separately, we still get
    # the same result out.
    model_dataset = reader.read(params['validation_data_path'])
    iterator.index_with(model.vocab)
    model_batch = next(iterator(model_dataset, shuffle=False))

    loaded_dataset = reader.read(params['validation_data_path'])
    iterator2.index_with(loaded_model.vocab)
    loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

    # Check gradients are None for non-trainable parameters and check that
    # trainable parameters receive some gradient if they are trainable.
    self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

    # The datasets themselves should be identical.
    assert model_batch.keys() == loaded_batch.keys()
    for key in model_batch.keys():
        self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

    # Set eval mode, to turn off things like dropout, then get predictions.
    model.eval()
    loaded_model.eval()

    # Models with stateful RNNs need their states reset to have consistent
    # behavior after loading.
    for model_ in [model, loaded_model]:
        for module in model_.modules():
            if hasattr(module, 'stateful') and module.stateful:
                module.reset_states()
    model_predictions = model(**model_batch)
    loaded_model_predictions = loaded_model(**loaded_batch)

    # Check loaded model's loss exists and we can compute gradients, for continuing training.
    loaded_model_loss = loaded_model_predictions["loss"]
    assert loaded_model_loss is not None
    loaded_model_loss.backward()

    # Both outputs should have the same keys and the values for these keys should be close.
    for key in model_predictions.keys():
        self.assert_fields_equal(model_predictions[key],
                                 loaded_model_predictions[key],
                                 name=key,
                                 tolerance=tolerance)

    return model, loaded_model
base_path = os.path.abspath('')
sys.path.append(base_path + "/")
import import_folders

from squad1_reader import Squad1Reader

load_dataset_from_disk = 0
load_pretrained_BiDAF = 1
build_model_from_scratch = 0

"""
################ LOAD PRETRAINED MODEL ###############
We load the pretrained model, see what parts it has, and maybe reuse them if needed.
"""
if load_pretrained_BiDAF:
    archive = load_archive("https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-model-2017.09.15-charpad.tar.gz")

    # Get the model and the config file
    model = archive.model
    config = archive.config.duplicate()

    keys_config = list(config.keys())
    print("Key config list: ", keys_config)
    for key in keys_config:
        print("Params of %s" % (key))
        print(config[key])

    ### Get the elements
    ## Data Readers ##
    dataset_reader_params = config["dataset_reader"]
    dataset_reader = DatasetReader.from_params(dataset_reader_params)

    ## Vocabulary ##
def main(file, embeddings, model, emb_wt_key, namespace, output_dir):
    archive = load_archive(model)
    config = archive.config
    os.makedirs(output_dir, exist_ok=True)
    config.to_file(os.path.join(output_dir, CONFIG_NAME))
    model = archive.model

    # first expand the vocabulary
    dataset_reader = DatasetReader.from_params(config.pop('dataset_reader'))
    instances = dataset_reader.read(file)
    vocab = model.vocab

    # get all the tokens in the new file
    namespace_token_counts: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
    for instance in Tqdm.tqdm(instances):
        instance.count_vocab_items(namespace_token_counts)
    old_token_size = vocab.get_vocab_size(namespace)
    print("Before expansion: Number of instances in {} namespace: {}".format(
        namespace, old_token_size))
    if namespace not in namespace_token_counts:
        logger.error("No tokens found for namespace: {} in the new input file".format(namespace))

    # identify the new tokens in the new instances
    token_to_add = set()
    token_hits = 0
    for token, count in namespace_token_counts[namespace].items():
        if token not in vocab._token_to_index[namespace]:
            # new token, must add
            token_to_add.add(token)
        else:
            token_hits += 1
    print("Found {} existing tokens and {} new tokens in {}".format(
        token_hits, len(token_to_add), file))

    # add the new tokens to the vocab
    for token in token_to_add:
        vocab.add_token_to_namespace(token=token, namespace=namespace)
    archived_parameters = dict(model.named_parameters())

    # second, expand the embedding matrix
    for name, weights in archived_parameters.items():
        # find the weight matrix for the embeddings
        if name == emb_wt_key:
            if weights.dim() != 2:
                logger.error("Expected an embedding matrix for the parameter: {}, instead "
                             "found a {} tensor".format(emb_wt_key, weights.shape))
            emb_dim = weights.shape[-1]
            print("Before expansion: Size of emb matrix: {}".format(weights.shape))
            # Loading embeddings for old and new tokens since that is cleaner than copying all
            # the embedding loading logic here
            all_embeddings = _read_pretrained_embeddings_file(embeddings, emb_dim, vocab, namespace)
            # concatenate the new entries, i.e. the last len(token_to_add) embeddings, to the original weights
            if len(token_to_add) > 0:
                weights.data = torch.cat([weights.data, all_embeddings[-len(token_to_add):, :]])
            print("After expansion: Size of emb matrix: {}".format(weights.shape))

    # save the files needed by the model archiver
    model_path = os.path.join(output_dir, "weight.th")
    model_state = model.state_dict()
    torch.save(model_state, model_path)
    vocab.save_to_files(os.path.join(output_dir, "vocabulary"))
    archive_model(output_dir, weights="weight.th")

    # more debug messages
    new_token_size = vocab.get_vocab_size(namespace)
    for name, weights in archived_parameters.items():
        if name == emb_wt_key:
            print("Size of emb matrix: {}".format(weights.shape))
    print("After expansion: Number of instances in {} namespace: {}".format(
        namespace, new_token_size))
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file, cuda_device=args.cuda_device)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
                         token_indexers=FEVERReader.custom_dict_from_params(
                             ds_params.pop('token_indexers', {})),
                         ner_facts=args.ner_facts)

    logger.info("Reading training data from %s", args.in_file)
    data = reader.read(args.in_file)

    actual = []
    predicted = []

    if args.log is not None:
        f = open(args.log, "w+")

    for item in tqdm(data):
        if item.fields["premise"] is None or item.fields["premise"].sequence_length() == 0:
            cls = "NOT ENOUGH INFO"
        else:
            prediction = model.forward_on_instance(item)
            cls = model.vocab._index_to_token["labels"][np.argmax(prediction["label_probs"])]

        if "label" in item.fields:
            actual.append(item.fields["label"].label)

        if args.ner_missing is not None:
            if args.ner_missing == 'oracle' and item.fields["label"].label == "NOT ENOUGH INFO" and cls != "NOT ENOUGH INFO":
                if item.fields["metadata"].metadata["ner_missing"]:
                    cls = "NOT ENOUGH INFO"

            if args.ner_missing == 'oracle' and item.fields["label"].label == "SUPPORTS" and cls != "SUPPORTS":
                if item.fields["metadata"].metadata["ner_missing"]:
                    cls = "SUPPORTS"

            if args.ner_missing == 'oracle' and item.fields["label"].label == "REFUTES" and cls != "REFUTES":
                if item.fields["metadata"].metadata["ner_missing"]:
                    cls = "REFUTES"

            if args.ner_missing == 'naive' and cls == 'SUPPORTS':
                if item.fields["metadata"].metadata["ner_missing"]:
                    highest = np.argmax(prediction["label_probs"])
                    lowest = np.argmin(prediction["label_probs"])
                    copy = []
                    for pred in prediction["label_probs"]:
                        copy.append(pred)
                    copy[highest] = prediction["label_probs"][lowest]

                    original_logits = prediction["label_logits"][highest]
                    chosen_logits = prediction["label_logits"][np.argmax(copy)]
                    difference_logits = original_logits - chosen_logits

                    if difference_logits < 3.0:
                        cls = model.vocab._index_to_token["labels"][np.argmax(copy)]

        predicted.append(cls)

        if args.log is not None:
            if "label" in item.fields:
                f.write(json.dumps({"actual": item.fields["label"].label, "predicted": cls}) + "\n")
            else:
                f.write(json.dumps({"predicted": cls}) + "\n")

    if args.log is not None:
        f.close()

    if len(actual) > 0:
        print(accuracy_score(actual, predicted))
        print(classification_report(actual, predicted))
        print(confusion_matrix(actual, predicted))

    return model
def _load_predictor(archive_file: str, predictor_name: str) -> Predictor:
    """
    Helper to load the desired predictor from the given archive.
    """
    archive = load_archive(archive_file)
    return Predictor.from_archive(archive, predictor_name)
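# Minimal usage sketch for the helper above.  The archive path and registered
# predictor name are placeholders, and the input dict depends on the predictor;
# predict_json is the standard Predictor entry point.
predictor = _load_predictor("path/to/model.tar.gz", "sentence-tagger")
output = predictor.predict_json({"sentence": "AllenNLP makes archives easy to load."})
print(output)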
optparser.add_argument("--batch_size", type=int, default=None, help="Overwrite batch size.") optparser.add_argument("--parse_on_cpu", action="store_true", default=False, help="Enforce parsing on the CPU.") args = optparser.parse_args() if args.beam < 1: print("Beam size must be at least 1") sys.exit() archive = load_archive(args.archive_file, args.cuda_device) config = archive.config prepare_environment(config) model = archive.model model.eval() model.k_best = args.beam model.parse_on_gpu = not args.parse_on_cpu pipelinepieces = PipelineTrainerPieces.from_params(config) if args.batch_size is not None and args.batch_size > 0: assert isinstance(pipelinepieces.annotator.data_iterator, SameFormalismIterator) iterator: SameFormalismIterator = pipelinepieces.annotator.data_iterator pipelinepieces.annotator.data_iterator = SameFormalismIterator( iterator.formalisms, args.batch_size)
def test_predictor(): question_json = { "id": "1700", "question_tokens": [ "@start@", "For", "what", "does", "a", "stove", "generally", "generate", "heat", "?", "@end@" ], "choice_tokens_list": [["@start@", "warming", "the", "air", "in", "the", "area", "@end@"], [ "@start@", "heating", "nutrients", "to", "appropriate", "temperatures", "@end@" ], [ "@start@", "entertaining", "various", "visitors", "and", "guests", "@end@" ], ["@start@", "to", "create", "electrical", "charges", "@end@"]], "facts_tokens_list": [["@start@", "UML", "can", "generate", "code", "@end@"], ["@start@", "generate", "is", "a", "synonym", "of", "beget", "@end@"], ["@start@", "Heat", "is", "generated", "by", "a", "stove", "@end@"], [ "@start@", "A", "sonnet", "is", "generally", "very", "structured", "@end@" ], [ "@start@", "A", "fundamentalist", "is", "generally", "right", "-", "wing", "@end@" ], ["@start@", "menstruation", "is", "generally", "crampy", "@end@"], [ "@start@", "an", "erection", "is", "generally", "pleasurable", "@end@" ], ["@start@", "gunfire", "is", "generally", "lethal", "@end@"], ["@start@", "ejaculating", "is", "generally", "pleasurable", "@end@"], ["@start@", "Huddersfield", "is", "generally", "urban", "@end@"], [ "@start@", "warming", "is", "a", "synonym", "of", "calefacient", "@end@" ], ["@start@", "heat", "is", "related", "to", "warming", "air", "@end@"], ["@start@", "a", "stove", "is", "for", "warming", "food", "@end@"], [ "@start@", "an", "air", "conditioning", "is", "for", "warming", "@end@" ], ["@start@", "The", "earth", "is", "warming", "@end@"], [ "@start@", "a", "heat", "source", "is", "for", "warming", "up", "@end@" ], ["@start@", "A", "foyer", "is", "an", "enterance", "area", "@end@"], ["@start@", "Being", "nosey", "is", "not", "appropriate", "@end@"], [ "@start@", "seize", "is", "a", "synonym", "of", "appropriate", "@end@" ], [ "@start@", "a", "fitting", "room", "is", "used", "for", "something", "appropriate", "@end@" ], [ "@start@", "appropriate", "is", "a", "synonym", "of", "allow", "@end@" ], [ "@start@", "appropriate", "is", "similar", "to", "befitting", "@end@" ], [ "@start@", "appropriate", "is", "similar", "to", "grade", "-", "appropriate", "@end@" ], [ "@start@", "grade", "-", "appropriate", "is", "similar", "to", "appropriate", "@end@" ], [ "@start@", "A", "parlor", "is", "used", "for", "entertaining", "guests", "@end@" ], [ "@start@", "a", "back", "courtyard", "is", "for", "entertaining", "guests", "@end@" ], ["@start@", "guest", "is", "a", "type", "of", "visitor", "@end@"], [ "@start@", "a", "family", "room", "is", "for", "entertaining", "guests", "@end@" ], [ "@start@", "cooking", "a", "meal", "is", "for", "entertaining", "guests", "@end@" ], [ "@start@", "buying", "a", "house", "is", "for", "entertaining", "guests", "@end@" ], [ "@start@", "having", "a", "party", "is", "for", "entertaining", "guests", "@end@" ], [ "@start@", "a", "dining", "area", "is", "used", "for", "entertaining", "guests", "@end@" ], ["@start@", "visitor", "is", "related", "to", "guest", "@end@"], ["@start@", "guest", "is", "related", "to", "visitor", "@end@"], ["@start@", "Electrical", "charges", "are", "additive", "@end@"], ["@start@", "Lightning", "is", "an", "electrical", "charge", "@end@"], ["@start@", "electrons", "have", "electrical", "charge", "@end@"], [ "@start@", "A", "judge", "is", "in", "charge", "in", "a", "courtroom", "@end@" ], [ "@start@", "charge", "is", "a", "synonym", "of", "accusation", "@end@" ], [ "@start@", "A", "consultant", "can", "charge", "a", "fee", "to", "a", "client", 
"@end@" ], [ "@start@", "charge", "is", "a", "synonym", "of", "commission", "@end@" ], [ "@start@", "charge", "is", "a", "synonym", "of", "cathexis", "@end@" ], ["@start@", "charge", "is", "not", "cash", "@end@"], ["@start@", "arraign", "entails", "charge", "@end@"], [ "@start@", "a", "stove", "generates", "heat", "for", "cooking", "usually", "@end@" ], [ "@start@", "preferences", "are", "generally", "learned", "characteristics", "@end@" ], [ "@start@", "a", "windmill", "does", "not", "create", "pollution", "@end@" ], [ "@start@", "temperature", "is", "a", "measure", "of", "heat", "energy", "@end@" ], [ "@start@", "a", "hot", "something", "is", "a", "source", "of", "heat", "@end@" ], [ "@start@", "the", "moon", "does", "not", "contain", "water", "@end@" ], ["@start@", "sunlight", "produces", "heat", "@end@"], ["@start@", "an", "oven", "is", "a", "source", "of", "heat", "@end@"], [ "@start@", "a", "hot", "substance", "is", "a", "source", "of", "heat", "@end@" ], [ "@start@", "a", "car", "engine", "is", "a", "source", "of", "heat", "@end@" ], [ "@start@", "as", "the", "amount", "of", "rainfall", "increases", "in", "an", "area", ",", "the", "amount", "of", "available", "water", "in", "that", "area", "will", "increase", "@end@" ], ["@start@", "sound", "can", "travel", "through", "air", "@end@"], [ "@start@", "the", "greenhouse", "effect", "is", "when", "carbon", "in", "the", "air", "heats", "a", "planet", "'s", "atmosphere", "@end@" ], [ "@start@", "a", "community", "is", "made", "of", "many", "types", "of", "organisms", "in", "an", "area", "@end@" ], ["@start@", "air", "is", "a", "vehicle", "for", "sound", "@end@"], [ "@start@", "rainfall", "is", "the", "amount", "of", "rain", "an", "area", "receives", "@end@" ], [ "@start@", "an", "animal", "requires", "air", "for", "survival", "@end@" ], [ "@start@", "humidity", "is", "the", "amount", "of", "water", "vapor", "in", "the", "air", "@end@" ], [ "@start@", "if", "some", "nutrients", "are", "in", "the", "soil", "then", "those", "nutrients", "are", "in", "the", "food", "chain", "@end@" ], [ "@start@", "as", "heat", "is", "transferred", "from", "something", "to", "something", "else", ",", "the", "temperature", "of", "that", "something", "will", "decrease", "@end@" ], ["@start@", "uneven", "heating", "causes", "convection", "@end@"], [ "@start@", "as", "temperature", "during", "the", "day", "increases", ",", "the", "temperature", "in", "an", "environment", "will", "increase", "@end@" ], [ "@start@", "uneven", "heating", "of", "the", "Earth", "'s", "surface", "cause", "wind", "@end@" ], [ "@start@", "an", "animal", "needs", "to", "eat", "food", "for", "nutrients", "@end@" ], [ "@start@", "soil", "contains", "nutrients", "for", "plants", "@end@" ], [ "@start@", "if", "two", "objects", "have", "the", "same", "charge", "then", "those", "two", "materials", "will", "repel", "each", "other", "@end@" ], ["@start@", "water", "is", "an", "electrical", "conductor", "@end@"], [ "@start@", "a", "battery", "is", "a", "source", "of", "electrical", "energy", "@end@" ], [ "@start@", "metal", "is", "an", "electrical", "energy", "conductor", "@end@" ], [ "@start@", "when", "an", "electrical", "circuit", "is", "working", "properly", ",", "electrical", "current", "runs", "through", "the", "wires", "in", "that", "circuit", "@end@" ], ["@start@", "brick", "is", "an", "electrical", "insulator", "@end@"], [ "@start@", "wood", "is", "an", "electrical", "energy", "insulator", "@end@" ], [ "@start@", "a", "toaster", "converts", "electrical", "energy", "into", "heat", "energy", 
"for", "toasting", "@end@" ]], "gold_label": 1, "gold_facts": { "fact1": "a stove generates heat for cooking usually", "fact2": "cooking involves heating nutrients to higher temperatures" }, "label_probs": [ 0.002615198493003845, 0.9686304330825806, 0.008927381597459316, 0.01982697658240795 ], "label_ranks": [3, 0, 2, 1], "predicted_label": 1, } inputs = question_to_predictor_input(question_json) inputs = predictor_input_to_pred_input_with_full_question_text(inputs) print(json.dumps(inputs, indent=4)) archive = load_archive('_trained_models/model_CN5_1202.tar.gz') predictor = Predictor.from_archive(archive, 'predictor-qa-mc-with-know-visualize') result = predictor.predict_json(inputs) print(result)
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    while True:
        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = db.get_doc_lines(page)
            # Guard on the number of tab-separated fields so lines without a
            # sentence (or empty lines) do not raise an IndexError.
            lines = [
                line.split("\t")[1] if len(line.split("\t")) > 1 else ""
                for line in lines.split("\n")
            ]
            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(
            zip(scores, [pl[1] for pl in p_lines], [pl[2] for pl in p_lines],
                [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(
            sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
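# `tf_idf_sim` is used above but not defined in this snippet. The sketch below
# is an assumption of what such a helper could look like, using scikit-learn's
# TfidfVectorizer; the actual FEVER implementation may differ.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def tf_idf_sim(claim, sentences):
    # Fit a shared vocabulary over the claim and all candidate sentences, then
    # score each sentence by cosine similarity of its TF-IDF vector to the claim.
    matrix = TfidfVectorizer().fit_transform([claim] + list(sentences))
    # Row 0 is the claim; the remaining rows are the candidate sentences.
    return cosine_similarity(matrix[0:1], matrix[1:])[0]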
def __init__(self, vocab: Vocabulary,
             token_representation_dim: int,
             encoder: Optional[Seq2SeqEncoder] = None,
             decoder: Optional[Union[FeedForward, str]] = None,
             contextualizer: Optional[Contextualizer] = None,
             pretrained_file: Optional[str] = None,
             transfer_contextualizer_from_pretrained_file: bool = False,
             transfer_encoder_from_pretrained_file: bool = False,
             freeze_encoder: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(SelectiveRegressor, self).__init__(vocab, regularizer)

    self._token_representation_dim = token_representation_dim
    self._contextualizer = contextualizer
    if encoder is None:
        encoder = PassThroughEncoder(input_dim=self._token_representation_dim)
    self._encoder = encoder

    # Load the contextualizer and encoder weights from the
    # pretrained_file if applicable
    if pretrained_file:
        archive = None
        if self._contextualizer and transfer_contextualizer_from_pretrained_file:
            logger.info("Attempting to load contextualizer weights from "
                        "pretrained_file at {}".format(pretrained_file))
            archive = load_archive(cached_path(pretrained_file))
            contextualizer_state = archive.model._contextualizer.state_dict()
            contextualizer_layer_num = self._contextualizer._layer_num
            self._contextualizer.load_state_dict(contextualizer_state)
            if contextualizer_layer_num is not None:
                logger.info("Setting layer num to {}".format(
                    contextualizer_layer_num))
                self._contextualizer.set_layer_num(contextualizer_layer_num)
            else:
                self._contextualizer.reset_layer_num()
            logger.info("Successfully loaded contextualizer weights!")
        if transfer_encoder_from_pretrained_file:
            logger.info("Attempting to load encoder weights from "
                        "pretrained_file at {}".format(pretrained_file))
            if archive is None:
                archive = load_archive(cached_path(pretrained_file))
            encoder_state = archive.model._encoder.state_dict()
            self._encoder.load_state_dict(encoder_state)
            logger.info("Successfully loaded encoder weights!")

    self._freeze_encoder = freeze_encoder
    for parameter in self._encoder.parameters():
        # If freeze is true, requires_grad should be false and vice versa.
        parameter.requires_grad_(not self._freeze_encoder)

    if decoder is None or decoder == "linear":
        # Create the default decoder (a single linear layer, i.e. linear
        # regression) if it is not provided.
decoder = FeedForward.from_params( Params({ "input_dim": self._encoder.get_output_dim(), "num_layers": 1, "hidden_dims": 1, "activations": "linear" })) logger.info("No decoder provided to model, using default " "decoder: {}".format(decoder)) elif decoder == "mlp": # Create the MLP decoder decoder = FeedForward.from_params( Params({ "input_dim": self._encoder.get_output_dim(), "num_layers": 2, "hidden_dims": [1024, 1], "activations": ["relu", "linear"] })) logger.info("Using MLP decoder: {}".format(decoder)) self._decoder = decoder check_dimensions_match(self._token_representation_dim, self._encoder.get_input_dim(), "token representation dim", "encoder input dim") check_dimensions_match(self._encoder.get_output_dim(), self._decoder.get_input_dim(), "encoder output dim", "decoder input dim") check_dimensions_match(self._decoder.get_output_dim(), 1, "decoder output dim", "1, since we're predicting a real value") # SmoothL1Loss as described in "Neural Models of Factuality" (NAACL 2018) self.loss = torch.nn.SmoothL1Loss(reduction="none") self.metrics = { "mae": MeanAbsoluteError(), "pearson_r": PearsonCorrelation() } # Whether to run in error analysis mode or not, see commands.error_analysis self.error_analysis = False logger.info("Applying initializer...") initializer(self)
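# For reference, `decoder` above accepts either a FeedForward module or one of
# the strings "linear" / "mlp". The direct construction below is a usage sketch
# only; the example dimension and keyword choices are assumptions, and in the
# original setup the model would normally be built from a configuration file.
model = SelectiveRegressor(
    vocab=vocab,                    # an existing allennlp Vocabulary
    token_representation_dim=1024,  # must match the incoming token representations
    decoder="mlp",                  # builds the 2-layer [1024, 1] FeedForward shown above
    freeze_encoder=True,            # sets requires_grad=False on encoder parameters
)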
def test_train_model_distributed(self): if torch.cuda.device_count() >= 2: devices = [0, 1] else: devices = [-1, -1] params = lambda: Params({ "model": { "type": "simple_tagger", "text_field_embedder": { "token_embedders": { "tokens": { "type": "embedding", "embedding_dim": 5 } } }, "encoder": { "type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2 }, }, "dataset_reader": { "type": "sequence_tagging" }, "train_data_path": SEQUENCE_TAGGING_DATA_PATH, "validation_data_path": SEQUENCE_TAGGING_DATA_PATH, "data_loader": { "batch_size": 2 }, "trainer": { "num_epochs": 2, "optimizer": "adam", # Need to use the fully qualified name here so the distributed workers # can import it. "callbacks": ["tests.commands.train_test.TrainingPrimaryCheckCallback"], }, "distributed": { "cuda_devices": devices }, }) out_dir = os.path.join(self.TEST_DIR, "test_distributed_train") train_model(params(), serialization_dir=out_dir) # Check that some logs specific to distributed # training are where we expect. serialized_files = os.listdir(out_dir) assert "out_worker0.log" in serialized_files assert "out_worker1.log" in serialized_files assert "model.tar.gz" in serialized_files assert "metrics.json" in serialized_files # Make sure the metrics look right. with open(os.path.join(out_dir, "metrics.json")) as f: metrics = json.load(f) assert metrics["peak_worker_0_memory_MB"] > 0 assert metrics["peak_worker_1_memory_MB"] > 0 if torch.cuda.device_count() >= 2: assert metrics["peak_gpu_0_memory_MB"] > 0 assert metrics["peak_gpu_1_memory_MB"] > 0 # Check we can load the serialized model assert load_archive(out_dir).model
import os

from allennlp.common.util import JsonDict, import_submodules
from allennlp.data import Instance
from allennlp.models import load_archive
from allennlp.predictors.predictor import Predictor


@Predictor.register('bistm_crf_predictor', exist_ok=True)
class bistm_crf_predictor(Predictor):
    def predict_json(self, inputs: JsonDict):
        abstract = inputs['abstract']
        return self.predict_line(abstract)

    def predict_line(self, line: str):
        instance = self._dataset_reader.text_to_instance(line)
        output_dict = self.predict_instance(instance)
        # return {'predict_title': output_dict['predict_title']}
        return {'input': line, 'predict_title': output_dict['predict_title']}


if __name__ == '__main__':
    import_submodules('using_allennlp')
    serialization_dir = "/home/liangjiaxi/TMP_PROJECT/pingan_event_extraction/tmp/debugger_train"
    archive = load_archive(os.path.join(serialization_dir, 'model.tar.gz'))
    predictor = Predictor.from_archive(
        archive, 'bistm_crf_predictor',
        dataset_reader_to_load="zhaiyao_datareader")
    # Sample input: a Chinese financial-news sentence (an announcement about a
    # chairman terminating the unexecuted part of a share-purchase plan), used
    # here to exercise the predictor.
    line = '东土科技(300353)公告,公司此前曾披露,控股股东、实控人、董事长李平拟于2017年10月23日起12个月内增持不低于1亿元,累计增持比例不超本公司已发行股份的2%。李平于2018年1月31日至2月8日增持212.68万股,增持资金2431万元。由于相关融资增持监管政策变化导致无法筹措增持资金,李平现申请终止履行未实施部分的增持计划。'
    a = predictor.predict_line(line)
    print(a)
def main():
    # Load SNLI dataset
    single_id_indexer = SingleIdTokenIndexer(lowercase_tokens=True)  # word tokenizer
    tokenizer = WordTokenizer(end_tokens=["@@NULL@@"])  # add @@NULL@@ to the end of sentences
    reader = SnliReader(token_indexers={'tokens': single_id_indexer}, tokenizer=tokenizer)
    dev_dataset = reader.read('https://s3-us-west-2.amazonaws.com/allennlp/datasets/snli/snli_1.0_dev.jsonl')
    # Load model and vocab
    model = load_archive('https://allennlp.s3-us-west-2.amazonaws.com/models/esim-glove-snli-2019.04.23.tar.gz').model
    model.eval().cuda()
    vocab = model.vocab

    # add hooks for embeddings so we can compute gradients w.r.t. the input tokens
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # save the word embedding matrix

    # Batches of examples to construct triggers
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # Subsample the dataset to one class to do a universal attack on that class
    dataset_label_filter = 'entailment'  # only entailment examples
    # dataset_label_filter = 'contradiction' # only contradiction examples
    # dataset_label_filter = 'neutral' # only neutral examples
    subset_dev_dataset = []
    for instance in dev_dataset:
        if instance['label'].label == dataset_label_filter:
            subset_dev_dataset.append(instance)

    # the attack is targeted towards a specific class
    # target_label = "0" # flip to entailment
    target_label = "1"  # flip to contradiction
    # target_label = "2" # flip to neutral

    # A k-d tree if you want to do gradient + nearest neighbors
    # tree = KDTree(embedding_weight.numpy())

    # Get original accuracy before adding universal triggers
    utils.get_accuracy(model, subset_dev_dataset, vocab, trigger_token_ids=None, snli=True)
    model.train()  # rnn cannot do backwards in eval mode

    # Initialize triggers
    num_trigger_tokens = 1  # one token prepended
    trigger_token_ids = [vocab.get_token_index("a")] * num_trigger_tokens

    # sample batches, update the triggers, and repeat
    for batch in lazy_groups_of(iterator(subset_dev_dataset, num_epochs=10, shuffle=True), group_size=1):
        # get model accuracy with current triggers
        utils.get_accuracy(model, subset_dev_dataset, vocab, trigger_token_ids, snli=True)
        model.train()  # rnn cannot do backwards in eval mode

        # get grad of triggers
        averaged_grad = utils.get_average_grad(model, batch, trigger_token_ids, target_label, snli=True)

        # find attack candidates using an attack method
        cand_trigger_token_ids = attacks.hotflip_attack(averaged_grad,
                                                        embedding_weight,
                                                        trigger_token_ids,
                                                        num_candidates=40)
        # cand_trigger_token_ids = attacks.random_attack(embedding_weight,
        #                                                trigger_token_ids,
        #                                                num_candidates=40)
        # cand_trigger_token_ids = attacks.nearest_neighbor_grad(averaged_grad,
        #                                                        embedding_weight,
        #                                                        trigger_token_ids,
        #                                                        tree,
        #                                                        100,
        #                                                        decrease_prob=True)

        # query the model to get the best candidates
        trigger_token_ids = utils.get_best_candidates(model,
                                                      batch,
                                                      trigger_token_ids,
                                                      cand_trigger_token_ids,
                                                      snli=True)
import os from allennlp.common.util import import_submodules from allennlp.models import load_archive from allennlp.predictors import Predictor import_submodules('telegram_classifier') archive_folder = 'trained-lstm/20190802-232055-less_regularized' archive = load_archive(os.path.join(archive_folder, 'model.tar.gz')) predictor = Predictor.from_archive(archive, 'roommate_pred') while True: text = input("Phrase? ") output = predictor.predict_json({"text": text}) print("Sender:", output['label']) for i in range(archive.model.vocab.get_vocab_size('labels')): labelName = archive.model.vocab.get_token_from_index(i, 'labels') percent = output['probs'][i] * 100.0 print(f' {labelName}: {percent:.1f}%')
def ensure_model_can_train_save_and_load(self, param_file: str, tolerance: float = 1e-4, cuda_device: int = -1): save_dir = os.path.join(self.TEST_DIR, "save_and_load_test") archive_file = os.path.join(save_dir, "model.tar.gz") model = train_model_from_file(param_file, save_dir) loaded_model = load_archive(archive_file, cuda_device=cuda_device).model state_keys = model.state_dict().keys() loaded_state_keys = loaded_model.state_dict().keys() assert state_keys == loaded_state_keys # First we make sure that the state dict (the parameters) are the same for both models. for key in state_keys: assert_allclose(model.state_dict()[key].cpu().numpy(), loaded_model.state_dict()[key].cpu().numpy(), err_msg=key) params = Params.from_file(self.param_file) reader = DatasetReader.from_params(params['dataset_reader']) # Need to duplicate params because Iterator.from_params will consume. iterator_params = params['iterator'] iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict())) iterator = DataIterator.from_params(iterator_params) iterator2 = DataIterator.from_params(iterator_params2) # We'll check that even if we index the dataset with each model separately, we still get # the same result out. model_dataset = reader.read(params['validation_data_path']) iterator.index_with(model.vocab) model_batch = next( iterator(model_dataset, shuffle=False, cuda_device=cuda_device)) loaded_dataset = reader.read(params['validation_data_path']) iterator2.index_with(loaded_model.vocab) loaded_batch = next( iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device)) # Check gradients are None for non-trainable parameters and check that # trainable parameters receive some gradient if they are trainable. self.check_model_computes_gradients_correctly(model, model_batch) # The datasets themselves should be identical. assert model_batch.keys() == loaded_batch.keys() for key in model_batch.keys(): self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6) # Set eval mode, to turn off things like dropout, then get predictions. model.eval() loaded_model.eval() # Models with stateful RNNs need their states reset to have consistent # behavior after loading. for model_ in [model, loaded_model]: for module in model_.modules(): if hasattr(module, 'stateful') and module.stateful: module.reset_states() model_predictions = model(**model_batch) loaded_model_predictions = loaded_model(**loaded_batch) # Check loaded model's loss exists and we can compute gradients, for continuing training. loaded_model_loss = loaded_model_predictions["loss"] assert loaded_model_loss is not None loaded_model_loss.backward() # Both outputs should have the same keys and the values for these keys should be close. for key in model_predictions.keys(): self.assert_fields_equal(model_predictions[key], loaded_model_predictions[key], name=key, tolerance=tolerance) return model, loaded_model
# Note: the FEVER dataset reader we reuse from the original FEVER code requires the
# name of a trained model, even for training. Since we only access that name once
# (via uofa_params.pop), it is loaded here, outside the loop.
# Step 3: read the data.
objUofaTrainTest = UofaTrainTest()

if run_name == "annotation" and dataset == "fnc":
    path_to_trained_models = path_to_trained_models_folder + name_of_trained_model_to_use
    convert_fnc_to_fever_and_annotate(FeverDocDB, path_to_trained_models,
                                      mithun_logger, cuda_device,
                                      path_to_pyproc_annotated_data_folder)

db = FeverDocDB(path_to_saved_db)
archive = load_archive(
    path_to_trained_models_folder + name_of_trained_model_to_use, cuda_device)
config = archive.config
ds_params = config["dataset_reader"]
model = archive.model
model.eval()

mithun_logger.info("Going to initialize FEVERReaderUofa.")
fever_reader = FEVERReaderUofa(
    db,
    sentence_level=ds_params.pop("sentence_level", False),
    wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
    claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
    token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})))
def test_train_model_distributed_with_sharded_reader(self, lazy): if torch.cuda.device_count() >= 2: devices = [0, 1] else: devices = [-1, -1] params = lambda: Params({ "model": { "type": "simple_tagger", "text_field_embedder": { "token_embedders": { "tokens": { "type": "embedding", "embedding_dim": 5 } } }, "encoder": { "type": "lstm", "input_size": 5, "hidden_size": 7, "num_layers": 2 }, }, "dataset_reader": { "type": "sharded", "base_reader": { "type": "sequence_tagging" }, "lazy": lazy, }, "train_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "validation_data_path": SEQUENCE_TAGGING_SHARDS_PATH, "data_loader": { "batch_size": 2 }, "trainer": { "num_epochs": 2, "optimizer": "adam" }, "distributed": { "cuda_devices": devices }, }) out_dir = os.path.join(self.TEST_DIR, "test_distributed_train") train_model(params(), serialization_dir=out_dir) # Check that some logs specific to distributed # training are where we expect. serialized_files = os.listdir(out_dir) assert "stderr_worker0.log" in serialized_files assert "stdout_worker0.log" in serialized_files assert "stderr_worker1.log" in serialized_files assert "stdout_worker1.log" in serialized_files assert "model.tar.gz" in serialized_files # Check we can load the serialized model archive = load_archive(out_dir) assert archive.model # Check that we created a vocab from all the shards. tokens = archive.model.vocab._token_to_index["tokens"].keys() assert tokens == { "@@PADDING@@", "@@UNKNOWN@@", "are", ".", "animals", "plants", "vehicles", "cats", "dogs", "snakes", "birds", "ferns", "trees", "flowers", "vegetables", "cars", "buses", "planes", "rockets", } # TODO: This is somewhat brittle. Make these constants in trainer.py. train_early = "finishing training early!" validation_early = "finishing validation early!" train_complete = "completed its entire epoch (training)." validation_complete = "completed its entire epoch (validation)." # There are three shards, but only two workers, so the first worker will have to discard some data. with open(os.path.join(out_dir, "stdout_worker0.log")) as f: worker0_log = f.read() assert train_early in worker0_log assert validation_early in worker0_log assert train_complete not in worker0_log assert validation_complete not in worker0_log with open(os.path.join(out_dir, "stdout_worker1.log")) as f: worker1_log = f.read() assert train_early not in worker1_log assert validation_early not in worker1_log assert train_complete in worker1_log assert validation_complete in worker1_log
def main():
    # Read the SQuAD validation dataset using a word tokenizer
    single_id = SingleIdTokenIndexer(lowercase_tokens=True)
    reader = SquadReader(token_indexers={'tokens': single_id})
    dev_dataset = reader.read(
        'https://s3-us-west-2.amazonaws.com/allennlp/datasets/squad/squad-dev-v1.1.json'
    )
    # Load the model and its associated vocabulary.
    model = load_archive(
        'https://s3-us-west-2.amazonaws.com/allennlp/models/bidaf-glove-2019.05.09.tar.gz'
    ).model
    vocab = model.vocab
    model.eval().cuda()

    # filter to just certain `wh` questions
    who_questions_dev, what_questions_dev, where_questions_dev, when_questions_dev, \
        how_questions_dev, why_questions_dev, which_questions_dev, other_questions_dev = ([] for i in range(8))
    for item in dev_dataset:
        for word in item['question']:
            if word.text.lower() == 'who':
                who_questions_dev.append(item)
                break
            elif word.text.lower() == 'what':
                what_questions_dev.append(item)
                break
            elif word.text.lower() == 'where':
                where_questions_dev.append(item)
                break
            elif word.text.lower() == 'when':
                when_questions_dev.append(item)
                break
            elif word.text.lower() == 'how':
                how_questions_dev.append(item)
                break
            elif word.text.lower() == 'why':
                why_questions_dev.append(item)
                break
            elif word.text.lower() == 'which':
                which_questions_dev.append(item)
                break
        else:
            # no `wh` word found in the question
            other_questions_dev.append(item)

    # Use batches to craft the universal perturbations
    universal_perturb_batch_size = 32
    iterator = BasicIterator(batch_size=universal_perturb_batch_size)
    iterator.index_with(vocab)

    # We register a gradient hook on the embeddings.
    utils.add_hooks(model)
    embedding_weight = utils.get_embedding_weight(model)  # save the word embedding matrix

    # Initialize the trigger. The first one is an initialization with all "the" tokens.
    # You can customize it. Make sure to set the fixed target answer and the question type.
    # The second is a trigger found after running as reported in our paper.
    trigger_init = "the the the the donald trump the the the the"
    target_answer = "donald trump"
    subsampled_dev_dataset = who_questions_dev  # universal attack on `who` questions
    # trigger_init = "why how ; known because : to kill american people ."
    # target_answer = "to kill american people"
    # subsampled_dev_dataset = why_questions_dev # universal attack on `why` questions

    # tokenizes the trigger, and finds the start/end span
    # make sure the trigger tokens are space separated
    trigger_token_ids = [
        vocab.get_token_index(t) for t in trigger_init.split(' ')
    ]
    span_start = trigger_init.split(' ').index(target_answer.split(' ')[0])  # start of target_answer
    span_end = trigger_init.split(' ').index(target_answer.split(' ')[-1])
    # we ignore replacement at the positions of the answer (answer is fixed)
    ignore_indices = [0]*(span_start) + \
        [1]*(span_end - span_start + 1) + [0]*(len(trigger_token_ids) - 1 - span_end)

    # larger values for these parameters give better results, but are slower
    num_candidates = 20
    beam_size = 5
    for _ in range(100):
        # Get targeted accuracy
        squad_utils.get_accuracy_squad(model,
                                       subsampled_dev_dataset,
                                       vocab,
                                       trigger_token_ids,
                                       target_answer,
                                       span_start,
                                       span_end)
        model.train()

        # Get the gradient for the appended tokens averaged over the batch.
averaged_grad = squad_utils.get_average_grad_squad( model, vocab, trigger_token_ids, subsampled_dev_dataset, span_start, span_end) # Use an attack method to get the top candidates cand_trigger_token_ids = attacks.hotflip_attack( averaged_grad, embedding_weight, trigger_token_ids, num_candidates=num_candidates, increase_loss=False) # Query the model with the top candidates to find the best tokens. trigger_token_ids = squad_utils.get_best_candidates_squad( model, trigger_token_ids, cand_trigger_token_ids, vocab, subsampled_dev_dataset, beam_size, ignore_indices, span_start, span_end)
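        # Not part of the original loop: the current trigger ids can be decoded
        # back to tokens with the model's vocabulary to inspect the trigger as
        # the search progresses (a small added sketch).
        trigger_tokens = [vocab.get_token_from_index(token_id) for token_id in trigger_token_ids]
        print("Current trigger:", " ".join(trigger_tokens))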
import argparse

from allennlp.common.util import import_submodules, prepare_environment
from allennlp.models import load_archive
from allenpipeline.Decoder import split_up
import allennlp.nn.util as util

if __name__ == "__main__":
    import_submodules("topdown_parser")
    from topdown_parser.dataset_readers.same_formalism_iterator import SameFormalismIterator

    optparser = argparse.ArgumentParser(
        add_help=True, description="Count trainable parameters.")
    optparser.add_argument('archive_file', type=str,
                           help='the archived model whose parameters will be counted')

    args = optparser.parse_args()

    archive = load_archive(args.archive_file)
    config = archive.config
    prepare_environment(config)
    model = archive.model

    total_params = 0
    # for module in model.modules():
    #     params = module.parameters()
    #     print(module, sum(p.numel() for p in params if p.requires_grad))
    for p in model.parameters():
        if p.requires_grad:
            total_params += p.numel()

    print(round(total_params / 1_000_000, 2), "M", "parameters")
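    # The commented-out loop above hints at a per-module breakdown; the lines
    # below are an added sketch of that idea (not part of the original script),
    # reporting trainable parameters for each top-level submodule.
    for name, module in model.named_children():
        num_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
        print(f"  {name}: {round(num_params / 1_000_000, 2)} M parameters")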