def get_bert_test_fixture():
    embedder_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/bert_test_fixture.tar.gz",
        "requires_grad": True,
        "top_layer_only": True,
    }
    embedder_params_copy = dict(embedder_params)
    embedder = TokenEmbedder.from_params(Params(embedder_params))

    indexer_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/vocab.txt",
        "do_lowercase": True,
        "use_starting_offsets": True,
        "max_pieces": 512,
    }
    indexer_params_copy = dict(indexer_params)
    indexer = TokenIndexer.from_params(Params(indexer_params))

    return {'embedder': embedder, 'embedder_params': embedder_params_copy,
            'indexer': indexer, 'indexer_params': indexer_params_copy}
def compile_featurizer(self, tokenizer: Tokenizer) -> InputFeaturizer:
    """Creates the featurizer based on the configured input features

    :::tip
    If you are creating configurations programmatically,
    use this method to check that you provided a valid configuration.
    :::

    Parameters
    ----------
    tokenizer
        Tokenizer used for this featurizer

    Returns
    -------
    featurizer
        The configured `InputFeaturizer`
    """
    configuration = self._make_allennlp_config()
    indexer = {
        feature_namespace: TokenIndexer.from_params(Params(config["indexer"]))
        for feature_namespace, config in configuration.items()
    }
    return InputFeaturizer(tokenizer, indexer=indexer)
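# Illustrative sketch (not part of the snippet above): the indexer dict that
# compile_featurizer builds can also be assembled directly with
# TokenIndexer.from_params. The configuration shape below is an assumption for
# demonstration only; "single_id" is a standard registered AllenNLP indexer type.
from allennlp.common import Params
from allennlp.data.token_indexers import TokenIndexer

configuration = {
    "words": {"indexer": {"type": "single_id", "namespace": "words"}},
}
indexer = {
    feature_namespace: TokenIndexer.from_params(Params(config["indexer"]))
    for feature_namespace, config in configuration.items()
}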
def from_params(cls, params: Params):
    """
    Parameters
    ----------
    squad_filename : ``str``
    negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
    """
    negative_sentence_selection = params.pop('negative_sentence_selection', 'paragraph')
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SquadSentenceSelectionReader(negative_sentence_selection=negative_sentence_selection,
                                        tokenizer=tokenizer,
                                        token_indexers=token_indexers)
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file, cuda_device=args.cuda_device)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})))

    logger.info("Reading training data from %s", args.in_file)
    data = reader.read(args.in_file).instances

    actual = []
    predicted = []

    if args.log is not None:
        f = open(args.log, "w+")

    for item in tqdm(data):
        if item.fields["premise"] is None or item.fields["premise"].sequence_length() == 0:
            cls = "NOT ENOUGH INFO"
        else:
            prediction = model.forward_on_instance(item, args.cuda_device)
            cls = model.vocab._index_to_token["labels"][np.argmax(prediction["label_probs"])]

        if "label" in item.fields:
            actual.append(item.fields["label"].label)
        predicted.append(cls)

        if args.log is not None:
            if "label" in item.fields:
                f.write(json.dumps({"actual": item.fields["label"].label, "predicted": cls}) + "\n")
            else:
                f.write(json.dumps({"predicted": cls}) + "\n")

    if args.log is not None:
        f.close()

    if len(actual) > 0:
        print(accuracy_score(actual, predicted))
        print(classification_report(actual, predicted))
        print(confusion_matrix(actual, predicted))

    return model
def _get_entity_indexers():
    indexer_params = Params({
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {"type": "just_spaces"},
        },
        "namespace": "entity"
    })
    return {'wordnet': TokenIndexer.from_params(indexer_params)}
def _get_indexer(namespace):
    return TokenIndexer.from_params(Params({
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {"type": "just_spaces"},
        },
        "namespace": namespace
    }))
def from_params(cls, params: Params):
    """
    Parameters
    ----------
    token_indexers : ``Dict[str, Params]``, optional
    """
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SequenceTaggingDatasetReader(token_indexers=token_indexers)
def token_indexer_dict_from_params(params: Params) -> 'Dict[str, TokenIndexer]':  # type: ignore
    """
    We typically use ``TokenIndexers`` in a dictionary, with each ``TokenIndexer`` getting a
    name. The specification for this in a ``Params`` object is typically
    ``{"name" -> {indexer_params}}``. This method reads that whole set of parameters and
    returns a dictionary suitable for use in a ``TextField``.

    Because default values for token indexers are typically handled in the calling class to
    this and are based on checking for ``None``, if there were no parameters specifying any
    token indexers in the given ``params``, we return ``None`` instead of an empty dictionary.
    """
    token_indexers = {}
    for name, indexer_params in params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    if token_indexers == {}:
        token_indexers = None
    return token_indexers
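# Illustrative usage sketch of the helper above (assumption: the standard AllenNLP
# registered indexer names "single_id" and "characters" are available). The Params
# shape is the ``{"name": {indexer_params}}`` mapping described in the docstring.
from allennlp.common import Params

indexer_params = Params({
    "tokens": {"type": "single_id", "namespace": "tokens"},
    "token_characters": {"type": "characters"},
})
token_indexers = token_indexer_dict_from_params(indexer_params)
# Returns a dict mapping "tokens" and "token_characters" to constructed TokenIndexers;
# with an empty Params({}) it returns None rather than an empty dict.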
def from_params(cls, params: Params):
    """
    Parameters
    ----------
    filename : ``str``
    tokenizer : ``Params``, optional
    token_indexers : ``Dict[str, Params]``, optional
    """
    tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
    token_indexers = {}
    token_indexer_params = params.pop('token_indexers', Params({}))
    for name, indexer_params in token_indexer_params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    # The default parameters are contained within the class,
    # so if no parameters are given we must pass None.
    if token_indexers == {}:
        token_indexers = None
    params.assert_empty(cls.__name__)
    return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
def test_token_characters_indexer_tokenizer(self):
    params = Params({
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {"type": "just_spaces"},
        },
        "namespace": "tok"
    })

    indexer = TokenIndexer.from_params(params)

    vocab = Vocabulary()
    vocab.add_token_to_namespace("the", namespace="tok")
    vocab.add_token_to_namespace("2", namespace="tok")

    indices = indexer.tokens_to_indices([Token(t) for t in "the 2 .".split()], vocab, 'a')

    self.assertListEqual(indices['a'], [[2], [3], [1]])
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]], cuda_device: int,
                serialization_dir: str, filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affects the reproducibility of your experiment before you
    import and use this function; these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    params: Params, required.
        A parameter object specifying an AllenNLP Experiment.
    serialization_dir: str, required
        The directory in which to save results and logs.
    """
    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"), sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"), sys.stderr)  # type: ignore
    handler = logging.FileHandler(os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"), "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop("sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.", ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(params.pop("vocabulary", {}),
                                   Dataset([instance for dataset in all_datasets
                                            for instance in dataset.instances]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
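# Illustrative sketch (not from the snippet): the shape of the experiment Params that
# train_model above consumes. The keys mirror the params.pop(...) calls in the function
# body; every value and path is a placeholder, not a tested configuration.
from allennlp.common import Params

experiment_params = Params({
    "dataset_reader": {
        "sentence_level": False,
        "wiki_tokenizer": {},
        "claim_tokenizer": {},
        "token_indexers": {"tokens": {"type": "single_id"}},
    },
    "train_data_path": "data/train.jsonl",        # placeholder
    "validation_data_path": "data/dev.jsonl",     # placeholder, optional
    "vocabulary": {},
    "model": {"type": "decomposable_attention"},  # placeholder model type
    "iterator": {"type": "basic", "batch_size": 32},
    "trainer": {"num_epochs": 10, "optimizer": "adam"},
})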
def __init__(
    self,
    archive_file: str,
    dropout: float = None,
    bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
    remove_bos_eos: bool = True,
    requires_grad: bool = False,
) -> None:
    super().__init__()

    overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

    # Import here to avoid circular dependency.
    from allennlp.models.archival import load_archive

    # Load LM and the associated config.
    archive = load_archive(archive_file, overrides=json.dumps(overrides))
    self._lm: LanguageModel = archive.model
    self._lm.delete_softmax()
    config = archive.config
    dict_config = config.as_dict(quiet=True)

    # Extract the name of the tokens that the LM was trained on.
    text_field_embedder = dict_config["model"]["text_field_embedder"]
    text_field_embedder = TextFieldEmbedder.from_params(Params(text_field_embedder))
    if not isinstance(text_field_embedder, BasicTextFieldEmbedder):
        raise ConfigurationError(
            f"Language model from {archive_file} uses a non-standard TextFieldEmbedder!"
        )
    non_empty_embedders = [
        name
        for name, token_embedder in text_field_embedder._token_embedders.items()
        if not isinstance(token_embedder, EmptyEmbedder)
    ]

    if len(non_empty_embedders) == 0:
        # Only empty embedders were contained in the language model.
        # We need at least one non-empty embedder in the language model.
        raise ConfigurationError(
            f"Language model from {archive_file} trained with only empty embedders!"
        )
    elif len(non_empty_embedders) > 1:
        raise ConfigurationError(
            f"Language model from {archive_file} trained with multiple non-empty embedders!"
        )

    self._token_name = non_empty_embedders[0]

    # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
    # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
    # downstream model uses both, tokens and token characters, say, and only adds bos/eos
    # tokens to the token characters, the dimensions don't match. See:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
    #
    # For the equivalent hack in the ELMo embedder see:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
    if bos_eos_tokens:
        dataset_reader_config = config.get("dataset_reader")
        token_indexer_config = dataset_reader_config.get("token_indexers").get(self._token_name)
        token_indexer: TokenIndexer = TokenIndexer.from_params(token_indexer_config)
        token_list = [Token(token) for token in bos_eos_tokens]
        # TODO(brendanr): Obtain these indices from the vocab once the
        # ELMoTokenCharactersIndexer adds the mappings.
        bos_eos_indices = token_indexer.tokens_to_indices(token_list, self._lm.vocab)["elmo_tokens"]
        self._bos_indices = torch.LongTensor(bos_eos_indices[0])
        self._eos_indices = torch.LongTensor(bos_eos_indices[1])
    else:
        self._bos_indices = None
        self._eos_indices = None

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    self._remove_bos_eos = remove_bos_eos
    num_layers = self._lm.num_layers()
    # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
    # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
    self._scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)

    character_dim = self._lm._text_field_embedder.get_output_dim()
    contextual_dim = self._lm._contextualizer.get_output_dim()

    if contextual_dim % character_dim != 0:
        raise ConfigurationError(
            "The output dimensions for the text_field_embedder "
            + f"({character_dim}) and the contextualizer ({contextual_dim})"
            + f" from the language model loaded from {archive_file} are "
            + "not compatible. Please check the config used to train that "
            + "model and ensure that the output dimension of the "
            + "text_field_embedder divides the output dimension of the "
            + "contextualizer."
        )
    self._character_embedding_duplication_count = contextual_dim // character_dim

    for param in self._lm.parameters():
        param.requires_grad = requires_grad
def test_read_from_file(self, lazy):
    reader = LevenshteinReader(token_indexers={"tokens": TokenIndexer.by_name("single_id")()},
                               lazy=lazy)
    self._check_outputs(reader)
def __init__(self,
             archive_file: str,
             dropout: float = None,
             bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
             remove_bos_eos: bool = True,
             requires_grad: bool = False) -> None:
    super().__init__()

    overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

    # Import here to avoid circular dependency.
    from allennlp.models.archival import load_archive

    # Load LM and the associated config.
    archive = load_archive(archive_file, overrides=json.dumps(overrides))
    self._lm: BidirectionalLanguageModel = archive.model
    self._lm.delete_softmax()
    config = archive.config
    dict_config = config.as_dict(quiet=True)

    # Extract the name of the tokens that the LM was trained on.
    text_field_embedder = dict_config["model"]["text_field_embedder"]
    token_names = list(text_field_embedder["token_embedders"].keys())
    if len(token_names) != 1:
        # We don't currently support embedding with language models trained with multiple
        # embedded indices.
        #
        # Note: We only care about embedded indices. This does not include "tokens" which
        # is just used to compute the loss in BidirectionalLanguageModel.
        raise ConfigurationError(f"LM from {archive_file} trained with multiple embedders!")
    if "embedder_to_indexer_map" in text_field_embedder:
        # Similarly we don't support multiple indexers per embedder.
        raise ConfigurationError(f"LM from {archive_file} trained with embedder_to_indexer_map!")
    self._token_name = token_names[0]

    # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
    # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
    # downstream model uses both, tokens and token characters, say, and only adds bos/eos
    # tokens to the token characters, the dimensions don't match. See:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
    #
    # For the equivalent hack in the ELMo embedder see:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
    if bos_eos_tokens:
        dataset_reader_config = config.get("dataset_reader")
        if dataset_reader_config.get("type") == "multiprocess":
            dataset_reader_config = dataset_reader_config.get("base_reader")
        token_indexer_config = dataset_reader_config.get("token_indexers").get(self._token_name)
        token_indexer: TokenIndexer = TokenIndexer.from_params(token_indexer_config)
        token_list = [Token(token) for token in bos_eos_tokens]
        # TODO(brendanr): Obtain these indices from the vocab once the
        # ELMoTokenCharactersIndexer adds the mappings.
        bos_eos_indices = token_indexer.tokens_to_indices(token_list, self._lm.vocab, "key")["key"]
        self._bos_indices = torch.Tensor(bos_eos_indices[0])
        self._eos_indices = torch.Tensor(bos_eos_indices[1])
    else:
        self._bos_indices = None
        self._eos_indices = None

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    self._remove_bos_eos = remove_bos_eos
    num_layers = self._lm.num_layers()
    # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
    # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
    self._scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)

    # pylint: disable=protected-access
    character_dim = self._lm._text_field_embedder.get_output_dim()
    contextual_dim = self._lm._contextualizer.get_output_dim()

    if contextual_dim % character_dim != 0:
        raise ConfigurationError(
            "The output dimensions for the text_field_embedder " +
            f"({character_dim}) and the contextualizer ({contextual_dim})" +
            f" from the language model loaded from {archive_file} are " +
            "not compatible. Please check the config used to train that " +
            "model and ensure that the output dimension of the " +
            "text_field_embedder divides the output dimension of the " +
            "contextualizer.")
    self._character_embedding_duplication_count = contextual_dim // character_dim

    for param in self._lm.parameters():
        param.requires_grad = requires_grad
def eval_model_fnc_data(db: FeverDocDB, args, mithun_logger, name_of_trained_model_to_use,
                        path_to_trained_models_folder, cuda_device, operation,
                        path_to_fnc_annotated_data) -> Model:
    print("got inside eval_model_fnc_data")
    archive = load_archive(path_to_trained_models_folder + name_of_trained_model_to_use, cuda_device)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})))

    # Annotating on the fly with pyprocessors (creating NER tags, POS tags, etc.) takes a
    # long time, so we almost always do it only once and load the result from disk.
    # Hence do_annotation_live = False.
    do_annotation_live = False

    data = reader.read_annotated_fnc_and_do_ner_replacement(args, operation, do_annotation_live,
                                                            mithun_logger,
                                                            path_to_fnc_annotated_data).instances
    joblib.dump(data, "fever_dev_dataset_format.pkl")

    # ################### end of running model and saving

    path = os.getcwd()
    # data = joblib.load(path + "fever_dev_dataset_format")

    actual = []
    predicted = []

    if args.log is not None:
        f = open(args.log, "w+")

    if_ctr, else_ctr = 0, 0
    pred_dict = defaultdict(int)

    for item in tqdm(data):
        if item.fields["premise"] is None or item.fields["premise"].sequence_length() == 0:
            cls = "NOT ENOUGH INFO"
            if_ctr += 1
        else:
            else_ctr += 1
            prediction = model.forward_on_instance(item, args.cuda_device)
            cls = model.vocab._index_to_token["labels"][np.argmax(prediction["label_probs"])]

        if "label" in item.fields:
            actual.append(item.fields["label"].label)
        predicted.append(cls)
        pred_dict[cls] += 1

        if args.log is not None:
            if "label" in item.fields:
                f.write(json.dumps({"actual": item.fields["label"].label, "predicted": cls}) + "\n")
            else:
                f.write(json.dumps({"predicted": cls}) + "\n")

    print(f'if_ctr = {if_ctr}')
    print(f'else_ctr = {else_ctr}')
    print(f'pred_dict = {pred_dict}')

    if args.log is not None:
        f.close()

    if len(actual) > 0:
        print(accuracy_score(actual, predicted))
        print(classification_report(actual, predicted))
        print(confusion_matrix(actual, predicted))

    return model
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})))

    while True:
        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = db.get_doc_lines(page)
            lines = [line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                     for line in lines.split("\n")]
            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(zip(scores,
                          [pl[1] for pl in p_lines],
                          [pl[2] for pl in p_lines],
                          [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))

        sentences_l = list(sorted(scores, reverse=True, key=lambda elem: elem[0]))
        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0], sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
reader = FEVERReader(db,
                     sentence_level=ds_params.pop("sentence_level", False),
                     wiki_tokenizer=Tokenizer.from_params(
                         ds_params.pop('wiki_tokenizer', {"word_splitter": {"type": "indexed_spaces"}})),
                     claim_tokenizer=Tokenizer.from_params(
                         ds_params.pop('claim_tokenizer', {"word_splitter": {"type": "indexed_spaces"}})),
                     token_indexers=TokenIndexer.dict_from_params(
                         ds_params.pop('token_indexers', {'tokens': SingleIdTokenIndexer()})))

print("")
print("")
print("")

while True:
    claim = input("enter claim (or q to quit) >>\t")
    if claim.lower() == "q":
        break
    if len(claim.strip()) < 2:
        continue

    print("Pages:")
    pages = evidence_retriever.get_docs_for_claim(claim)
archive = load_archive(path_to_trained_models_folder + name_of_trained_model_to_use, cuda_device)

config = archive.config
ds_params = config["dataset_reader"]

model = archive.model
model.eval()

mithun_logger.info(f"going to initiate FEVERReaderUofa.")
fever_reader = FEVERReaderUofa(db,
                               sentence_level=ds_params.pop("sentence_level", False),
                               wiki_tokenizer=Tokenizer.from_params(ds_params.pop('wiki_tokenizer', {})),
                               claim_tokenizer=Tokenizer.from_params(ds_params.pop('claim_tokenizer', {})),
                               token_indexers=TokenIndexer.dict_from_params(ds_params.pop('token_indexers', {})))
cwd = os.getcwd()

mithun_logger.info(f"going to start reading data.")
zipped_annotated_data, length_data = fever_reader.read(mithun_logger, cwd + path_to_pyproc_annotated_data_folder)
mithun_logger.info(f"done with reading data. going to generate features.")

data = None
for feature in features:
    # todo: right now there is only one feature (the NER one), so assigning `data` inside
    # this for loop works. However, features need to be added dynamically.
    fdl = feature + "_details"
    mithun_logger.info(f"value of fdl is:{fdl}")
    mithun_logger.info(f"value of feature is:{feature}")
def __init__(self,
             archive_file: str,
             dropout: float = None,
             bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
             remove_bos_eos: bool = True,
             requires_grad: bool = False) -> None:
    super().__init__()

    overrides = {
        "model": {
            "contextualizer": {
                "return_all_layers": True
            }
        }
    }

    # Import here to avoid circular dependency.
    from allennlp.models.archival import load_archive

    # Load LM and the associated config.
    archive = load_archive(archive_file, overrides=json.dumps(overrides))
    self._lm: LanguageModel = archive.model
    self._lm.delete_softmax()
    config = archive.config
    dict_config = config.as_dict(quiet=True)

    # Extract the name of the tokens that the LM was trained on.
    text_field_embedder = dict_config["model"]["text_field_embedder"]
    token_names = list(text_field_embedder["token_embedders"].keys())
    if len(token_names) != 1:
        # We don't currently support embedding with language models trained with multiple
        # embedded indices.
        #
        # Note: We only care about embedded indices. This does not include "tokens" which
        # is just used to compute the loss in LanguageModel.
        raise ConfigurationError(f"LM from {archive_file} trained with multiple embedders!")
    if "embedder_to_indexer_map" in text_field_embedder:
        # Similarly we don't support multiple indexers per embedder.
        raise ConfigurationError(f"LM from {archive_file} trained with embedder_to_indexer_map!")
    self._token_name = token_names[0]

    # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
    # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
    # downstream model uses both, tokens and token characters, say, and only adds bos/eos
    # tokens to the token characters, the dimensions don't match. See:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
    #
    # For the equivalent hack in the ELMo embedder see:
    # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
    if bos_eos_tokens:
        dataset_reader_config = config.get("dataset_reader")
        if dataset_reader_config.get("type") == "multiprocess":
            dataset_reader_config = dataset_reader_config.get("base_reader")
        token_indexer_config = dataset_reader_config.get("token_indexers").get(self._token_name)
        token_indexer: TokenIndexer = TokenIndexer.from_params(token_indexer_config)
        token_list = [Token(token) for token in bos_eos_tokens]
        # TODO(brendanr): Obtain these indices from the vocab once the
        # ELMoTokenCharactersIndexer adds the mappings.
        bos_eos_indices = token_indexer.tokens_to_indices(token_list, self._lm.vocab, "key")["key"]
        self._bos_indices = torch.Tensor(bos_eos_indices[0])
        self._eos_indices = torch.Tensor(bos_eos_indices[1])
    else:
        self._bos_indices = None
        self._eos_indices = None

    if dropout:
        self._dropout = torch.nn.Dropout(dropout)
    else:
        self._dropout = lambda x: x

    self._remove_bos_eos = remove_bos_eos
    num_layers = self._lm.num_layers()
    # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
    # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
    self._scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)

    # pylint: disable=protected-access
    character_dim = self._lm._text_field_embedder.get_output_dim()
    contextual_dim = self._lm._contextualizer.get_output_dim()

    if contextual_dim % character_dim != 0:
        raise ConfigurationError(
            "The output dimensions for the text_field_embedder " +
            f"({character_dim}) and the contextualizer ({contextual_dim})" +
            f" from the language model loaded from {archive_file} are " +
            "not compatible. Please check the config used to train that " +
            "model and ensure that the output dimension of the " +
            "text_field_embedder divides the output dimension of the " +
            "contextualizer.")
    self._character_embedding_duplication_count = contextual_dim // character_dim

    for param in self._lm.parameters():
        param.requires_grad = requires_grad
def __init__(
    self,
    experiment_name: str,
    vocab: Vocabulary,
    question_embedder: TextFieldEmbedder,
    schema_encoder: Seq2SeqEncoder,
    beam_encoder: Seq2SeqEncoder,
    tree_rep_transformer: Seq2SeqEncoder,
    utterance_augmenter: Seq2SeqEncoder,
    beam_summarizer: Seq2SeqEncoder,
    decoder_timesteps=9,
    beam_size=30,
    misc_params=None,
    dropout: float = 0.1,
) -> None:
    super().__init__(vocab)
    self._experiment_name = experiment_name
    self._misc_params = misc_params
    self.set_flags()
    self._utterance_augmenter = utterance_augmenter
    self._action_dim = beam_encoder.get_output_dim()
    self._beam_size = beam_size
    self._n_schema_leafs = 15
    self._num_values = 10

    self.tokenizer = TokenIndexer.by_name("pretrained_transformer")(
        model_name="Salesforce/grappa_large_jnt"
    )._allennlp_tokenizer.tokenizer

    if not self.cntx_reranker:
        self._noreranker_cntx_linear = torch.nn.Linear(
            in_features=self._action_dim, out_features=2 * self._action_dim)
    if not self.utt_aug:
        self._nobeam_cntx_linear = torch.nn.Linear(
            in_features=self._action_dim, out_features=2 * self._action_dim)
    self.activation_func = torch.nn.ReLU
    if self.lin_after_cntx:
        self.cntx_linear = torch.nn.Sequential(
            torch.nn.Linear(2 * self._action_dim, 4 * self._action_dim),
            torch.nn.Dropout(p=dropout),
            torch.nn.LayerNorm(4 * self._action_dim),
            self.activation_func(),
            torch.nn.Linear(4 * self._action_dim, 2 * self._action_dim),
        )
    if self.cntx_rep:
        self._cntx_rep_linear = torch.nn.Linear(
            in_features=self._action_dim, out_features=2 * self._action_dim)
    self._create_action_dicts()
    self.op_count = self.binary_op_count + self.unary_op_count
    self.xent = torch.nn.CrossEntropyLoss()

    self.type_embedding = torch.nn.Embedding(self.op_count, self._action_dim)
    self.summrize_vec = torch.nn.Embedding(num_embeddings=1, embedding_dim=self._action_dim)

    self.d_frontier = 2 * self._action_dim
    self.left_emb = torch.nn.Linear(in_features=self.d_frontier, out_features=self.d_frontier)
    self.right_emb = torch.nn.Linear(in_features=self.d_frontier, out_features=self.d_frontier)
    self.after_add = torch.nn.Sequential(
        torch.nn.Linear(self.d_frontier, self.d_frontier),
        torch.nn.Dropout(p=dropout),
        torch.nn.LayerNorm(self.d_frontier),
        self.activation_func(),
        torch.nn.Linear(self.d_frontier, self.d_frontier),
    )
    self._unary_frontier_embedder = torch.nn.Sequential(
        torch.nn.Linear(self.d_frontier, self.d_frontier),
        torch.nn.Dropout(p=dropout),
        torch.nn.LayerNorm(self.d_frontier),
        self.activation_func(),
        torch.nn.Linear(self.d_frontier, self.d_frontier),
    )
    self.op_linear = torch.nn.Linear(in_features=self.d_frontier, out_features=self.op_count)
    self.pre_op_linear = torch.nn.Sequential(
        torch.nn.Linear(self.d_frontier, self.d_frontier),
        torch.nn.Dropout(p=dropout),
        torch.nn.LayerNorm(self.d_frontier),
        self.activation_func(),
    )

    assert (self._action_dim % 2) == 0
    self.vocab = vocab
    self._question_embedder = question_embedder
    self._schema_encoder = schema_encoder
    self._beam_encoder = beam_encoder
    self._beam_summarizer = beam_summarizer

    self._tree_rep_transformer = tree_rep_transformer

    self._decoder_timesteps = decoder_timesteps
    self._beam_size = beam_size
    self.q_emb_dim = question_embedder.get_output_dim()

    self.dropout_prob = dropout
    self._action_dim = beam_encoder.get_output_dim()
    self._span_score_func = torch.nn.Linear(self._action_dim, 2)
    self._pooler = BagOfEmbeddingsEncoder(embedding_dim=self._action_dim)

    self._rank_schema = torch.nn.Sequential(
        torch.nn.Linear(self._action_dim, self._action_dim),
        torch.nn.Dropout(p=dropout),
        torch.nn.LayerNorm(self._action_dim),
        torch.nn.Tanh(),
        torch.nn.Linear(self._action_dim, 1),
    )
    self._rank_beam = torch.nn.Sequential(
        torch.nn.Linear(2 * self._action_dim, 2 * self._action_dim),
        torch.nn.Dropout(p=dropout),
        torch.nn.LayerNorm(2 * self._action_dim),
        torch.nn.Tanh(),
        torch.nn.Linear(2 * self._action_dim, 1),
    )
    self._emb_to_action_dim = torch.nn.Linear(
        in_features=self.q_emb_dim,
        out_features=self._action_dim,
    )

    self._create_type_tensor()

    self._bce_loss = torch.nn.BCEWithLogitsLoss(reduction="none")
    self._softmax = torch.nn.Softmax(dim=1)

    self._final_beam_acc = Average()
    self._reranker_acc = Average()
    self._spider_acc = Average()
    self._leafs_acc = Average()
    self._batch_size = -1
    self._device = None
    self._evaluate_func = partial(
        evaluate_single,
        db_dir=os.path.join("dataset", "database"),
        table_file=os.path.join("dataset", "tables.json"),
    )