Example #1
def get_bert_test_fixture():
    embedder_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/bert_test_fixture.tar.gz",
        "requires_grad": True,
        "top_layer_only": True,
    }
    embedder_params_copy = dict(embedder_params)
    embedder = TokenEmbedder.from_params(Params(embedder_params))

    indexer_params = {
        "type": "bert-pretrained",
        "pretrained_model": "tests/fixtures/bert/vocab.txt",
        "do_lowercase": True,
        "use_starting_offsets": True,
        "max_pieces": 512,
    }
    indexer_params_copy = dict(indexer_params)
    indexer = TokenIndexer.from_params(Params(indexer_params))

    return {
        'embedder': embedder,
        'embedder_params': embedder_params_copy,
        'indexer': indexer,
        'indexer_params': indexer_params_copy
    }
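
A short usage sketch for the fixture above (the test body is hypothetical). The ``*_params`` copies are returned because ``Params`` objects are consumed destructively by ``from_params`` (keys are popped in place), so the plain dicts are the only reusable record of the configuration:

# Minimal sketch, assuming classic AllenNLP and the fixture paths above.
fixture = get_bert_test_fixture()
embedder = fixture["embedder"]    # ready-to-use TokenEmbedder
indexer = fixture["indexer"]      # ready-to-use TokenIndexer
# The copies are plain dicts, safe to inspect or re-wrap in Params:
assert fixture["embedder_params"]["type"] == "bert-pretrained"
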
Example #2
    def compile_featurizer(self, tokenizer: Tokenizer) -> InputFeaturizer:
        """Creates the featurizer based on the configured input features

        :::tip
        If you are creating configurations programmatically
        use this method to check that you provided a valid configuration.
        :::

        Parameters
        ----------
        tokenizer
            Tokenizer used for this featurizer

        Returns
        -------
        featurizer
            The configured `InputFeaturizer`
        """
        configuration = self._make_allennlp_config()

        indexer = {
            feature_namespace: TokenIndexer.from_params(Params(config["indexer"]))
            for feature_namespace, config in configuration.items()
        }

        return InputFeaturizer(tokenizer, indexer=indexer)
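
Per the tip in the docstring, a hedged sketch of validating a programmatically built configuration early. Here ``config`` stands in for whatever object defines ``compile_featurizer`` and ``tokenizer`` for a configured ``Tokenizer``; both are placeholders, not guaranteed names:

# Sketch only: an invalid feature configuration fails here, at build time,
# rather than later during training.
try:
    featurizer = config.compile_featurizer(tokenizer)
except Exception as err:
    raise ValueError(f"invalid input feature configuration: {err}")
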
Example #3
 def from_params(cls, params: Params):
     """
     Parameters
     ----------
     squad_filename : ``str``
     negative_sentence_selection : ``str``, optional (default=``"paragraph"``)
     tokenizer : ``Params``, optional
     token_indexers : ``Dict[str, Params]``, optional
     """
     negative_sentence_selection = params.pop('negative_sentence_selection',
                                              'paragraph')
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SquadSentenceSelectionReader(
         negative_sentence_selection=negative_sentence_selection,
         tokenizer=tokenizer,
         token_indexers=token_indexers)
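
A sketch of the ``Params`` layout this method consumes, mirroring the keys it pops; the registered names ``"word"`` and ``"single_id"`` are the usual classic-AllenNLP defaults and are assumed here. Note that, as excerpted, the method never pops ``squad_filename`` even though the docstring lists it, so passing it would trip ``assert_empty``:

from allennlp.common import Params

params = Params({
    "negative_sentence_selection": "paragraph",
    "tokenizer": {"type": "word"},
    "token_indexers": {"tokens": {"type": "single_id"}},
})
reader = SquadSentenceSelectionReader.from_params(params)
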
Example #4
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file, cuda_device=args.cuda_device)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    logger.info("Reading training data from %s", args.in_file)
    data = reader.read(args.in_file).instances

    actual = []
    predicted = []

    if args.log is not None:
        f = open(args.log, "w+")

    for item in tqdm(data):
        if item.fields["premise"] is None or item.fields[
                "premise"].sequence_length() == 0:
            cls = "NOT ENOUGH INFO"
        else:
            prediction = model.forward_on_instance(item, args.cuda_device)
            cls = model.vocab._index_to_token["labels"][np.argmax(
                prediction["label_probs"])]

        if "label" in item.fields:
            actual.append(item.fields["label"].label)
        predicted.append(cls)

        if args.log is not None:
            if "label" in item.fields:
                f.write(
                    json.dumps({
                        "actual": item.fields["label"].label,
                        "predicted": cls
                    }) + "\n")
            else:
                f.write(json.dumps({"predicted": cls}) + "\n")

    if args.log is not None:
        f.close()

    if len(actual) > 0:
        print(accuracy_score(actual, predicted))
        print(classification_report(actual, predicted))
        print(confusion_matrix(actual, predicted))

    return model
Example #5
def _get_entity_indexers():
    indexer_params = Params({
        "type": "characters_tokenizer",
        "tokenizer": {
            "type": "word",
            "word_splitter": {
                "type": "just_spaces"
            },
        },
        "namespace": "entity"
    })
    return {'wordnet': TokenIndexer.from_params(indexer_params)}
Example #6
 def _get_indexer(namespace):
     return TokenIndexer.from_params(
         Params({
             "type": "characters_tokenizer",
             "tokenizer": {
                 "type": "word",
                 "word_splitter": {
                     "type": "just_spaces"
                 },
             },
             "namespace": namespace
         }))
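
The two helpers above differ only in whether the namespace is hard-coded. A one-line usage sketch tying them together (the ``"wordnet"`` and ``"entity"`` names come straight from Example #5):

# Equivalent to Example #5's return value, built via Example #6's helper:
entity_indexers = {"wordnet": _get_indexer("entity")}
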
Example #7
 def from_params(cls, params: Params):
     """
     Parameters
     ----------
     token_indexers : ``Dict[str, Params]``, optional
     """
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SequenceTaggingDatasetReader(token_indexers=token_indexers)
Example #8
def token_indexer_dict_from_params(
        params: Params) -> 'Dict[str, TokenIndexer]':  # type: ignore
    """
    We typically use ``TokenIndexers`` in a dictionary, with each ``TokenIndexer`` getting a
    name.  The specification for this in a ``Params`` object is typically ``{"name" ->
    {indexer_params}}``.  This method reads that whole set of parameters and returns a
    dictionary suitable for use in a ``TextField``.

    Because default values for token indexers are typically handled by the class
    that calls this method, which checks for ``None``, we return ``None`` instead
    of an empty dictionary when the given ``params`` specify no token indexers.
    """
    token_indexers = {}
    for name, indexer_params in params.items():
        token_indexers[name] = TokenIndexer.from_params(indexer_params)
    if token_indexers == {}:
        token_indexers = None
    return token_indexers
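
A sketch of the ``{"name" -> {indexer_params}}`` layout the docstring describes. The indexer types shown (``single_id``, ``characters``) are standard registered names in classic AllenNLP, assumed here for illustration:

from allennlp.common import Params

params = Params({
    "tokens": {"type": "single_id"},
    "token_characters": {"type": "characters"},
})
indexers = token_indexer_dict_from_params(params)
# -> {"tokens": <SingleIdTokenIndexer>, "token_characters": <TokenCharactersIndexer>}
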
Example #9
 def from_params(cls, params: Params):
     """
     Parameters
     ----------
     filename : ``str``
     tokenizer : ``Params``, optional
     token_indexers : ``Dict[str, Params]``, optional
     """
     tokenizer = Tokenizer.from_params(params.pop('tokenizer', {}))
     token_indexers = {}
     token_indexer_params = params.pop('token_indexers', Params({}))
     for name, indexer_params in token_indexer_params.items():
         token_indexers[name] = TokenIndexer.from_params(indexer_params)
     # The default parameters are contained within the class,
     # so if no parameters are given we must pass None.
     if token_indexers == {}:
         token_indexers = None
     params.assert_empty(cls.__name__)
     return SnliReader(tokenizer=tokenizer, token_indexers=token_indexers)
Example #10
    def test_token_characters_indexer_tokenizer(self):
        params = Params({
            "type": "characters_tokenizer",
            "tokenizer": {
                "type": "word",
                "word_splitter": {
                    "type": "just_spaces"
                },
            },
            "namespace": "tok"
        })

        indexer = TokenIndexer.from_params(params)

        vocab = Vocabulary()
        vocab.add_token_to_namespace("the", namespace="tok")
        vocab.add_token_to_namespace("2", namespace="tok")

        indices = indexer.tokens_to_indices(
            [Token(t) for t in "the 2 .".split()], vocab, 'a')

        self.assertListEqual(indices['a'], [[2], [3], [1]])
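
Why the expected indices are ``[[2], [3], [1]]``: a fresh AllenNLP ``Vocabulary`` reserves index 0 for padding and index 1 for the OOV token by default, so the first two added tokens land at 2 and 3, and the unseen ``"."`` falls back to 1. A minimal check under that assumption:

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("the", namespace="tok")     # assigned index 2
vocab.add_token_to_namespace("2", namespace="tok")       # assigned index 3
assert vocab.get_token_index(".", namespace="tok") == 1  # OOV default
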
Example #11
def train_model(db: FeverDocDB, params: Union[Params, Dict[str, Any]],
                cuda_device: int, serialization_dir: str,
                filtering: str) -> Model:
    """
    This function can be used as an entry point to running models in AllenNLP
    directly from a JSON specification using a :class:`Driver`. Note that if
    you care about reproducibility, you should avoid running code using Pytorch
    or numpy which affect the reproducibility of your experiment before you
    import and use this function, these libraries rely on random seeds which
    can be set in this function via a JSON specification file. Note that this
    function performs training and will also evaluate the trained model on
    development and test sets if provided in the parameter json.

    Parameters
    ----------
    db : FeverDocDB, required
        The wiki document database used to construct the ``FEVERReader``.
    params : Params, required
        A parameter object specifying an AllenNLP Experiment.
    cuda_device : int, required
        The CUDA device on which to train; if ``None``, the trainer
        configuration is left unchanged.
    serialization_dir : str, required
        The directory in which to save results and logs.
    filtering : str, required
        The filtering mode passed through to the ``FEVERReader``.
    """

    SimpleRandom.set_seeds()

    os.makedirs(serialization_dir, exist_ok=True)
    sys.stdout = TeeLogger(os.path.join(serialization_dir, "stdout.log"),
                           sys.stdout)  # type: ignore
    sys.stderr = TeeLogger(os.path.join(serialization_dir, "stderr.log"),
                           sys.stderr)  # type: ignore
    handler = logging.FileHandler(
        os.path.join(serialization_dir, "python_logging.log"))
    handler.setLevel(logging.INFO)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(levelname)s - %(name)s - %(message)s'))
    logging.getLogger().addHandler(handler)
    serialization_params = deepcopy(params).as_dict(quiet=True)

    with open(os.path.join(serialization_dir, "model_params.json"),
              "w") as param_file:
        json.dump(serialization_params, param_file, indent=4)

    # Now we begin assembling the required parts for the Trainer.
    ds_params = params.pop('dataset_reader', {})
    dataset_reader = FEVERReader(db,
                                 sentence_level=ds_params.pop(
                                     "sentence_level", False),
                                 wiki_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('wiki_tokenizer', {})),
                                 claim_tokenizer=Tokenizer.from_params(
                                     ds_params.pop('claim_tokenizer', {})),
                                 token_indexers=TokenIndexer.dict_from_params(
                                     ds_params.pop('token_indexers', {})),
                                 filtering=filtering)

    train_data_path = params.pop('train_data_path')
    logger.info("Reading training data from %s", train_data_path)
    train_data = dataset_reader.read(train_data_path)

    all_datasets = [train_data]
    datasets_in_vocab = ["train"]

    validation_data_path = params.pop('validation_data_path', None)
    if validation_data_path is not None:
        logger.info("Reading validation data from %s", validation_data_path)
        validation_data = dataset_reader.read(validation_data_path)
        all_datasets.append(validation_data)
        datasets_in_vocab.append("validation")
    else:
        validation_data = None

    logger.info("Creating a vocabulary using %s data.",
                ", ".join(datasets_in_vocab))
    vocab = Vocabulary.from_params(
        params.pop("vocabulary", {}),
        Dataset([
            instance for dataset in all_datasets
            for instance in dataset.instances
        ]))
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    model = Model.from_params(vocab, params.pop('model'))
    iterator = DataIterator.from_params(params.pop("iterator"))

    train_data.index_instances(vocab)
    if validation_data:
        validation_data.index_instances(vocab)

    trainer_params = params.pop("trainer")
    if cuda_device is not None:
        trainer_params["cuda_device"] = cuda_device
    trainer = Trainer.from_params(model, serialization_dir, iterator,
                                  train_data, validation_data, trainer_params)

    trainer.train()

    # Now tar up results
    archive_model(serialization_dir)

    return model
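
A hedged invocation sketch for the entry point above; the config path, database path, and filtering value are placeholders, and the ``FeverDocDB`` constructor signature is assumed from its usage here:

from allennlp.common import Params

params = Params.from_file("experiment.json")  # hypothetical config path
db = FeverDocDB("fever.db")                   # hypothetical database path
model = train_model(db, params, cuda_device=-1,
                    serialization_dir="out", filtering="")
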
Example #12
    def __init__(
        self,
        archive_file: str,
        dropout: float = None,
        bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
        remove_bos_eos: bool = True,
        requires_grad: bool = False,
    ) -> None:
        super().__init__()

        overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive

        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: LanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        text_field_embedder = TextFieldEmbedder.from_params(
            Params(text_field_embedder))
        if not isinstance(text_field_embedder, BasicTextFieldEmbedder):
            raise ConfigurationError(
                f"Language model from {archive_file} uses a non-standard TextFieldEmbedder!"
            )
        non_empty_embedders = [
            name for name, token_embedder in
            text_field_embedder._token_embedders.items()
            if not isinstance(token_embedder, EmptyEmbedder)
        ]

        if len(non_empty_embedders) == 0:
            # Only empty embedders were contained in the language model
            # We need at least one non-empty embedder in the language model
            raise ConfigurationError(
                f"Language model from {archive_file} trained with only empty embedders!"
            )
        elif len(non_empty_embedders) > 1:
            raise ConfigurationError(
                f"Language model from {archive_file} trained with multiple non-empty embedders!"
            )

        self._token_name = non_empty_embedders[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both, tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            token_indexer_config = dataset_reader_config.get(
                "token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(
                token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(
                token_list, self._lm.vocab)["elmo_tokens"]
            self._bos_indices = torch.LongTensor(bos_eos_indices[0])
            self._eos_indices = torch.LongTensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers,
                                     do_layer_norm=False,
                                     trainable=True)

        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                "The output dimensions for the text_field_embedder " +
                f"({character_dim}) and the contextualizer ({contextual_dim})"
                + f" from the language model loaded from {archive_file} are " +
                "not compatible. Please check the config used to train that " +
                "model and ensure that the output dimension of the " +
                "text_field_embedder divides the output dimension of the " +
                "contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad
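
The bos/eos indexing step above, isolated: under the two-argument ``tokens_to_indices`` API used in this snippet, an ELMo-style character indexer returns its output under the ``"elmo_tokens"`` key. A minimal sketch assuming ``ELMoTokenCharactersIndexer``:

from allennlp.data import Token, Vocabulary
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

indexer = ELMoTokenCharactersIndexer()
indices = indexer.tokens_to_indices([Token("<S>"), Token("</S>")], Vocabulary())
bos_indices, eos_indices = indices["elmo_tokens"]  # per-token character ids
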
Example #13
 def test_read_from_file(self, lazy):
     reader = LevenshteinReader(
         token_indexers={"tokens": TokenIndexer.by_name("single_id")()},
         lazy=False)
     self._check_outputs(reader)
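
``Registrable.by_name`` returns the registered class itself, so the trailing ``()`` above constructs it with default arguments. The same indexer spelled out directly (classic AllenNLP):

from allennlp.data.token_indexers import SingleIdTokenIndexer

# TokenIndexer.by_name("single_id") resolves to the SingleIdTokenIndexer class.
token_indexers = {"tokens": SingleIdTokenIndexer()}
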
Example #14
    def __init__(self,
                 archive_file: str,
                 dropout: float = None,
                 bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
                 remove_bos_eos: bool = True,
                 requires_grad: bool = False) -> None:
        super().__init__()

        overrides = {"model": {"contextualizer": {"return_all_layers": True}}}

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive
        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: BidirectionalLanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        token_names = list(text_field_embedder["token_embedders"].keys())
        if len(token_names) != 1:
            # We don't currently support embedding with language models trained with multiple
            # embedded indices.
            #
            # Note: We only care about embedded indices. This does not include "tokens" which
            # is just used to compute the loss in BidirectionalLanguageModel.
            raise ConfigurationError(
                f"LM from {archive_file} trained with multiple embedders!")
        if "embedder_to_indexer_map" in text_field_embedder:
            # Similarly we don't support multiple indexers per embedder.
            raise ConfigurationError(
                f"LM from {archive_file} trained with embedder_to_indexer_map!"
            )
        self._token_name = token_names[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both, tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            if dataset_reader_config.get("type") == "multiprocess":
                dataset_reader_config = dataset_reader_config.get(
                    "base_reader")
            token_indexer_config = dataset_reader_config.get(
                "token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(
                token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(
                token_list, self._lm.vocab, "key")["key"]
            self._bos_indices = torch.Tensor(bos_eos_indices[0])
            self._eos_indices = torch.Tensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers,
                                     do_layer_norm=False,
                                     trainable=True)

        # pylint: disable=protected-access
        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                "The output dimensions for the text_field_embedder " +
                f"({character_dim}) and the contextualizer ({contextual_dim})"
                + f" from the language model loaded from {archive_file} are " +
                "not compatible. Please check the config used to train that " +
                "model and ensure that the output dimension of the " +
                "text_field_embedder divides the output dimension of the " +
                "contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad
Example #15
def eval_model_fnc_data(db: FeverDocDB, args, mithun_logger,
                        name_of_trained_model_to_use,
                        path_to_trained_models_folder, cuda_device, operation,
                        path_to_fnc_annotated_data) -> Model:

    print("got inside eval_model_fnc_data")
    archive = load_archive(
        path_to_trained_models_folder + name_of_trained_model_to_use,
        cuda_device)
    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    # Do annotation on the fly using pyprocessors (i.e., creating NER tags,
    # POS tags, etc.). This takes a long time, so we almost always do it only
    # once and load the result from disk; hence do_annotation_live = False.
    do_annotation_live = False

    data = reader.read_annotated_fnc_and_do_ner_replacement(
        args, operation, do_annotation_live, mithun_logger,
        path_to_fnc_annotated_data).instances
    joblib.dump(data, "fever_dev_dataset_format.pkl")
    # --- end of running the model and saving ---

    path = os.getcwd()

    #data=joblib.load(path+"fever_dev_dataset_format")

    actual = []

    predicted = []

    if args.log is not None:
        f = open(args.log, "w+")
    if_ctr, else_ctr = 0, 0
    pred_dict = defaultdict(int)

    for item in tqdm(data):
        if item.fields["premise"] is None or item.fields[
                "premise"].sequence_length() == 0:
            cls = "NOT ENOUGH INFO"
            if_ctr += 1
        else:
            else_ctr += 1

            prediction = model.forward_on_instance(item, args.cuda_device)
            cls = model.vocab._index_to_token["labels"][np.argmax(
                prediction["label_probs"])]

        if "label" in item.fields:
            actual.append(item.fields["label"].label)
        predicted.append(cls)
        pred_dict[cls] += 1

        if args.log is not None:
            if "label" in item.fields:
                f.write(
                    json.dumps({
                        "actual": item.fields["label"].label,
                        "predicted": cls
                    }) + "\n")
            else:
                f.write(json.dumps({"predicted": cls}) + "\n")
    print(f'if_ctr = {if_ctr}')
    print(f'else_ctr = {else_ctr}')
    print(f'pred_dict = {pred_dict}')

    if args.log is not None:
        f.close()

    if len(actual) > 0:
        print(accuracy_score(actual, predicted))
        print(classification_report(actual, predicted))
        print(confusion_matrix(actual, predicted))

    return model
Example #16
def eval_model(db: FeverDocDB, args) -> Model:
    archive = load_archive(args.archive_file,
                           cuda_device=args.cuda_device,
                           overrides=args.overrides)

    config = archive.config
    ds_params = config["dataset_reader"]

    model = archive.model
    model.eval()

    reader = FEVERReader(db,
                         sentence_level=ds_params.pop("sentence_level", False),
                         wiki_tokenizer=Tokenizer.from_params(
                             ds_params.pop('wiki_tokenizer', {})),
                         claim_tokenizer=Tokenizer.from_params(
                             ds_params.pop('claim_tokenizer', {})),
                         token_indexers=TokenIndexer.dict_from_params(
                             ds_params.pop('token_indexers', {})))

    while True:

        claim = input("enter claim (or q to quit) >>")
        if claim.lower() == "q":
            break

        ranker = retriever.get_class('tfidf')(tfidf_path=args.model)

        p_lines = []
        pages, _ = ranker.closest_docs(claim, 5)

        for page in pages:
            lines = db.get_doc_lines(page)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        scores = tf_idf_sim(claim, [pl[0] for pl in p_lines])
        scores = list(
            zip(scores, [pl[1] for pl in p_lines], [pl[2] for pl in p_lines],
                [pl[0] for pl in p_lines]))
        scores = list(filter(lambda score: len(score[3].strip()), scores))
        sentences_l = list(
            sorted(scores, reverse=True, key=lambda elem: elem[0]))

        sentences = [s[3] for s in sentences_l[:5]]
        evidence = " ".join(sentences)

        print("Best pages: {0}".format(repr(pages)))

        print("Evidence:")
        for idx, sentence in enumerate(sentences_l[:5]):
            print("{0}\t{1}\t\t{2}\t{3}".format(idx + 1, sentence[0],
                                                sentence[1], sentence[3]))

        item = reader.text_to_instance(evidence, claim)

        prediction = model.forward_on_instance(item, args.cuda_device)
        cls = model.vocab._index_to_token["labels"][np.argmax(
            prediction["label_probs"])]
        print("PREDICTED: {0}".format(cls))
        print()
Example #17
    reader = FEVERReader(
        db,
        sentence_level=ds_params.pop("sentence_level", False),
        wiki_tokenizer=Tokenizer.from_params(
            ds_params.pop('wiki_tokenizer',
                          {"word_splitter": {
                              "type": "indexed_spaces"
                          }})),
        claim_tokenizer=Tokenizer.from_params(
            ds_params.pop('claim_tokenizer',
                          {"word_splitter": {
                              "type": "indexed_spaces"
                          }})),
        token_indexers=TokenIndexer.dict_from_params(
            ds_params.pop('token_indexers',
                          {'tokens': SingleIdTokenIndexer()})))

    print("")
    print("")
    print("")
    while True:
        claim = input("enter claim (or q to quit) >>\t")
        if claim.lower() == "q":
            break

        if len(claim.strip()) < 2:
            continue

        print("Pages:")
        pages = evidence_retriever.get_docs_for_claim(claim)
        archive = load_archive(
            path_to_trained_models_folder + name_of_trained_model_to_use,
            cuda_device)
        config = archive.config
        ds_params = config["dataset_reader"]
        model = archive.model
        model.eval()
        mithun_logger.info(f"going to initiate FEVERReaderUofa.")
        fever_reader = FEVERReaderUofa(
            db,
            sentence_level=ds_params.pop("sentence_level", False),
            wiki_tokenizer=Tokenizer.from_params(
                ds_params.pop('wiki_tokenizer', {})),
            claim_tokenizer=Tokenizer.from_params(
                ds_params.pop('claim_tokenizer', {})),
            token_indexers=TokenIndexer.dict_from_params(
                ds_params.pop('token_indexers', {})))

        cwd = os.getcwd()
        mithun_logger.info(f"going to start reading data.")
        zipped_annotated_data, length_data = fever_reader.read(
            mithun_logger, cwd + path_to_pyproc_annotated_data_folder)

        mithun_logger.info(
            f"done with reading data. going to generate features.")

        data = None
        for feature in features:
            # TODO: right now there is only one feature (the NER one), so building
            # `data` inside this for loop works; however, features eventually need
            # to be added dynamically.
            fdl = feature + "_details"
            mithun_logger.info(f"value of fdl is:{fdl}")
            mithun_logger.info(f"value of feature is:{feature}")
Example #18
    def __init__(self,
                 archive_file: str,
                 dropout: float = None,
                 bos_eos_tokens: Tuple[str, str] = ("<S>", "</S>"),
                 remove_bos_eos: bool = True,
                 requires_grad: bool = False) -> None:
        super().__init__()

        overrides = {
                "model": {
                        "contextualizer": {
                                "return_all_layers": True
                        }
                }
        }

        # Import here to avoid circular dependency.
        from allennlp.models.archival import load_archive
        # Load LM and the associated config.
        archive = load_archive(archive_file, overrides=json.dumps(overrides))
        self._lm: LanguageModel = archive.model
        self._lm.delete_softmax()
        config = archive.config
        dict_config = config.as_dict(quiet=True)

        # Extract the name of the tokens that the LM was trained on.
        text_field_embedder = dict_config["model"]["text_field_embedder"]
        token_names = list(text_field_embedder["token_embedders"].keys())
        if len(token_names) != 1:
            # We don't currently support embedding with language models trained with multiple
            # embedded indices.
            #
            # Note: We only care about embedded indices. This does not include "tokens" which
            # is just used to compute the loss in LanguageModel.
            raise ConfigurationError(f"LM from {archive_file} trained with multiple embedders!")
        if "embedder_to_indexer_map" in text_field_embedder:
            # Similarly we don't support multiple indexers per embedder.
            raise ConfigurationError(f"LM from {archive_file} trained with embedder_to_indexer_map!")
        self._token_name = token_names[0]

        # TODO(brendanr): Find a way to remove this hack. The issue fundamentally is that the
        # BasicTextFieldEmbedder concatenates multiple embedded representations. When a
        # downstream model uses both, tokens and token characters, say, and only adds bos/eos
        # tokens to the token characters, the dimensions don't match. See:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/text_field_embedders/basic_text_field_embedder.py#L109
        #
        # For the equivalent hack in the ELMo embedder see:
        # https://github.com/allenai/allennlp/blob/eff25a3085aa9976a7650d30d8961c3626ddc411/allennlp/modules/elmo.py#L590
        if bos_eos_tokens:
            dataset_reader_config = config.get("dataset_reader")
            if dataset_reader_config.get("type") == "multiprocess":
                dataset_reader_config = dataset_reader_config.get("base_reader")
            token_indexer_config = dataset_reader_config.get("token_indexers").get(self._token_name)
            token_indexer: TokenIndexer = TokenIndexer.from_params(token_indexer_config)
            token_list = [Token(token) for token in bos_eos_tokens]
            # TODO(brendanr): Obtain these indices from the vocab once the
            # ELMoTokenCharactersIndexer adds the mappings.
            bos_eos_indices = token_indexer.tokens_to_indices(token_list, self._lm.vocab, "key")["key"]
            self._bos_indices = torch.Tensor(bos_eos_indices[0])
            self._eos_indices = torch.Tensor(bos_eos_indices[1])
        else:
            self._bos_indices = None
            self._eos_indices = None

        if dropout:
            self._dropout = torch.nn.Dropout(dropout)
        else:
            self._dropout = lambda x: x

        self._remove_bos_eos = remove_bos_eos
        num_layers = self._lm.num_layers()
        # TODO(brendanr): Consider passing our LM as a custom module to `Elmo` instead.
        # See https://github.com/allenai/allennlp/blob/master/allennlp/modules/elmo.py#L76
        self._scalar_mix = ScalarMix(mixture_size=num_layers, do_layer_norm=False, trainable=True)

        # pylint: disable=protected-access
        character_dim = self._lm._text_field_embedder.get_output_dim()
        contextual_dim = self._lm._contextualizer.get_output_dim()

        if contextual_dim % character_dim != 0:
            raise ConfigurationError(
                    "The output dimensions for the text_field_embedder " +
                    f"({character_dim}) and the contextualizer ({contextual_dim})" +
                    f" from the language model loaded from {archive_file} are " +
                    "not compatible. Please check the config used to train that " +
                    "model and ensure that the output dimension of the " +
                    "text_field_embedder divides the output dimension of the " +
                    "contextualizer.")
        self._character_embedding_duplication_count = contextual_dim // character_dim

        for param in self._lm.parameters():
            param.requires_grad = requires_grad
Example #20
    def __init__(
        self,
        experiment_name: str,
        vocab: Vocabulary,
        question_embedder: TextFieldEmbedder,
        schema_encoder: Seq2SeqEncoder,
        beam_encoder: Seq2SeqEncoder,
        tree_rep_transformer: Seq2SeqEncoder,
        utterance_augmenter: Seq2SeqEncoder,
        beam_summarizer: Seq2SeqEncoder,
        decoder_timesteps=9,
        beam_size=30,
        misc_params=None,
        dropout: float = 0.1,
    ) -> None:
        super().__init__(vocab)
        self._experiment_name = experiment_name
        self._misc_params = misc_params
        self.set_flags()
        self._utterance_augmenter = utterance_augmenter
        self._action_dim = beam_encoder.get_output_dim()
        self._beam_size = beam_size
        self._n_schema_leafs = 15
        self._num_values = 10

        self.tokenizer = TokenIndexer.by_name("pretrained_transformer")(
            model_name="Salesforce/grappa_large_jnt"
        )._allennlp_tokenizer.tokenizer

        if not self.cntx_reranker:
            self._noreranker_cntx_linear = torch.nn.Linear(
                in_features=self._action_dim,
                out_features=2 * self._action_dim)
        if not self.utt_aug:
            self._nobeam_cntx_linear = torch.nn.Linear(
                in_features=self._action_dim,
                out_features=2 * self._action_dim)
        self.activation_func = torch.nn.ReLU
        if self.lin_after_cntx:
            self.cntx_linear = torch.nn.Sequential(
                torch.nn.Linear(2 * self._action_dim, 4 * self._action_dim),
                torch.nn.Dropout(p=dropout),
                torch.nn.LayerNorm(4 * self._action_dim),
                self.activation_func(),
                torch.nn.Linear(4 * self._action_dim, 2 * self._action_dim),
            )
        if self.cntx_rep:
            self._cntx_rep_linear = torch.nn.Linear(
                in_features=self._action_dim,
                out_features=2 * self._action_dim)
        self._create_action_dicts()
        self.op_count = self.binary_op_count + self.unary_op_count
        self.xent = torch.nn.CrossEntropyLoss()

        self.type_embedding = torch.nn.Embedding(self.op_count,
                                                 self._action_dim)
        self.summrize_vec = torch.nn.Embedding(num_embeddings=1,
                                               embedding_dim=self._action_dim)

        self.d_frontier = 2 * self._action_dim
        self.left_emb = torch.nn.Linear(in_features=self.d_frontier,
                                        out_features=self.d_frontier)
        self.right_emb = torch.nn.Linear(in_features=self.d_frontier,
                                         out_features=self.d_frontier)
        self.after_add = torch.nn.Sequential(
            torch.nn.Linear(self.d_frontier, self.d_frontier),
            torch.nn.Dropout(p=dropout),
            torch.nn.LayerNorm(self.d_frontier),
            self.activation_func(),
            torch.nn.Linear(self.d_frontier, self.d_frontier),
        )
        self._unary_frontier_embedder = torch.nn.Sequential(
            torch.nn.Linear(self.d_frontier, self.d_frontier),
            torch.nn.Dropout(p=dropout),
            torch.nn.LayerNorm(self.d_frontier),
            self.activation_func(),
            torch.nn.Linear(self.d_frontier, self.d_frontier),
        )

        self.op_linear = torch.nn.Linear(in_features=self.d_frontier,
                                         out_features=self.op_count)
        self.pre_op_linear = torch.nn.Sequential(
            torch.nn.Linear(self.d_frontier, self.d_frontier),
            torch.nn.Dropout(p=dropout),
            torch.nn.LayerNorm(self.d_frontier),
            self.activation_func(),
        )

        assert (self._action_dim % 2) == 0
        self.vocab = vocab
        self._question_embedder = question_embedder
        self._schema_encoder = schema_encoder
        self._beam_encoder = beam_encoder
        self._beam_summarizer = beam_summarizer

        self._tree_rep_transformer = tree_rep_transformer

        self._decoder_timesteps = decoder_timesteps
        self._beam_size = beam_size
        self.q_emb_dim = question_embedder.get_output_dim()

        self.dropout_prob = dropout
        self._action_dim = beam_encoder.get_output_dim()
        self._span_score_func = torch.nn.Linear(self._action_dim, 2)
        self._pooler = BagOfEmbeddingsEncoder(embedding_dim=self._action_dim)

        self._rank_schema = torch.nn.Sequential(
            torch.nn.Linear(self._action_dim, self._action_dim),
            torch.nn.Dropout(p=dropout),
            torch.nn.LayerNorm(self._action_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(self._action_dim, 1),
        )
        self._rank_beam = torch.nn.Sequential(
            torch.nn.Linear(2 * self._action_dim, 2 * self._action_dim),
            torch.nn.Dropout(p=dropout),
            torch.nn.LayerNorm(2 * self._action_dim),
            torch.nn.Tanh(),
            torch.nn.Linear(2 * self._action_dim, 1),
        )
        self._emb_to_action_dim = torch.nn.Linear(
            in_features=self.q_emb_dim,
            out_features=self._action_dim,
        )

        self._create_type_tensor()

        self._bce_loss = torch.nn.BCEWithLogitsLoss(reduction="none")

        self._softmax = torch.nn.Softmax(dim=1)
        self._final_beam_acc = Average()
        self._reranker_acc = Average()
        self._spider_acc = Average()

        self._leafs_acc = Average()
        self._batch_size = -1
        self._device = None
        self._evaluate_func = partial(
            evaluate_single,
            db_dir=os.path.join("dataset", "database"),
            table_file=os.path.join("dataset", "tables.json"),
        )
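
The ``TokenIndexer.by_name("pretrained_transformer")(...)._allennlp_tokenizer.tokenizer`` chain above reaches through two wrappers (the AllenNLP indexer, then its AllenNLP tokenizer) to the underlying HuggingFace tokenizer. A hedged equivalent that skips the private attributes, assuming the ``transformers`` library is available:

from transformers import AutoTokenizer

# Load the same underlying tokenizer directly from the HuggingFace hub.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/grappa_large_jnt")
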