Example #1
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = "en_core_web_sm") -> None:
    super().__init__(model, dataset_reader)
    self._language = language
    self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)
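
A minimal usage sketch (not part of the example above) of a SpacyTokenizer constructed with pos_tags=True, assuming the en_core_web_sm spaCy model is installed:

from allennlp.data.tokenizers import SpacyTokenizer

tokenizer = SpacyTokenizer(language="en_core_web_sm", pos_tags=True)
tokens = tokenizer.tokenize("John refused to consider joining the club.")
for token in tokens:
    # With pos_tags=True, each Token also carries its part-of-speech tags.
    print(token.text, token.tag_, token.pos_)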
Example #2
    def test_more_than_two_overlapping_predicates(self):
        """
        Test whether the predictor can correctly consolidate multiword
        predicates.
        """
        tokenizer = SpacyTokenizer(pos_tags=True)

        sent_tokens = tokenizer.tokenize(
            "John refused to consider joining the club.")

        # Emulate predications - for "refused" and "consider" and "joining"
        predictions = [
            [
                "B-ARG0", "B-V", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1",
                "I-ARG1", "O"
            ],
            [
                "B-ARG0", "B-BV", "I-BV", "B-V", "B-ARG1", "I-ARG1", "I-ARG1",
                "O"
            ],
            ["B-ARG0", "B-BV", "I-BV", "I-BV", "B-V", "B-ARG1", "I-ARG1", "O"],
        ]

        # Consolidate
        pred_dict = consolidate_predictions(predictions, sent_tokens)

        # Check that only "refused to consider joining" is left
        assert len(pred_dict) == 1
        tags = list(pred_dict.values())[0]
        assert get_predicate_text(sent_tokens,
                                  tags) == "refused to consider joining"
def read(fn: str) -> List[Extraction]:
    tokenizer = SpacyTokenizer(pos_tags=True)
    prev_sent = []

    with open(fn) as fin:
        for line in tqdm(fin):
            data = line.strip().split("\t")
            confidence = data[0]
            if not all(data[2:5]):
                # Make sure that all required elements are present
                continue
            arg1, rel, args2 = map(parse_element, data[2:5])

            # Exactly one subject and one relation
            # and at least one object
            if (len(rel) == 1) and (len(arg1) == 1) and (len(args2) >= 1):
                sent = data[5]
                cur_ex = Extraction(
                    sent=sent,
                    toks=tokenizer.tokenize(sent),
                    arg1=arg1[0],
                    rel=rel[0],
                    args2=args2,
                    confidence=confidence,
                )

                # Decide whether to append or yield
                if (not prev_sent) or (prev_sent[0].sent == sent):
                    prev_sent.append(cur_ex)
                else:
                    yield prev_sent
                    prev_sent = [cur_ex]
    if prev_sent:
        # Yield last element
        yield prev_sent
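
A hedged usage sketch for the read() generator above; the file path and the printed fields are assumptions for illustration only.

# Hypothetical driver for read(); "extractions.tsv" is a placeholder path.
for sentence_extractions in read("extractions.tsv"):
    # Each yielded list groups every Extraction found for one input sentence.
    first = sentence_extractions[0]
    print(first.sent, "->", len(sentence_extractions), "extractions")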
Example #4
    def test_predicate_consolidation(self):
        """
        Test whether the predictor can correctly consolidate multiword
        predicates.
        """
        tokenizer = SpacyTokenizer(pos_tags=True)

        sent_tokens = tokenizer.tokenize(
            "In December, John decided to join the party.")

        # Emulate predications - for both "decided" and "join"
        predictions = [
            [
                "B-ARG2", "I-ARG2", "O", "B-ARG0", "B-V", "B-ARG1", "I-ARG1",
                "I-ARG1", "I-ARG1", "O"
            ],
            [
                "O", "O", "O", "B-ARG0", "B-BV", "I-BV", "B-V", "B-ARG1",
                "I-ARG1", "O"
            ],
        ]
        # Consolidate
        pred_dict = consolidate_predictions(predictions, sent_tokens)

        # Check that only "decided to join" is left
        assert len(pred_dict) == 1
        tags = list(pred_dict.values())[0]
        assert get_predicate_text(sent_tokens, tags) == "decided to join"
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: str = "en_core_web_sm") -> None:
    super().__init__(model, dataset_reader)
    # TODO(Mark) Make the language configurable and based on a model attribute.
    self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)
class TestNerTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyTokenizer(ner=True)

    def test_count_vocab_items_uses_ner_tags(self):
        tokens = self.tokenizer.tokenize("Larry Page is CEO of Google.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["ner_tokens"] == {"PERSON": 2, "ORG": 1, "NONE": 6}

    def test_tokens_to_indices_uses_ner_tags(self):
        tokens = self.tokenizer.tokenize("Larry Page is CEO of Google.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        person_index = vocab.add_token_to_namespace("PERSON",
                                                    namespace="ner_tags")
        none_index = vocab.add_token_to_namespace("NONE", namespace="ner_tags")
        vocab.add_token_to_namespace("ORG", namespace="ner_tags")
        indexer = NerTagIndexer(namespace="ner_tags")
        assert indexer.tokens_to_indices([tokens[1]], vocab, "tokens1") == {
            "tokens1": [person_index]
        }
        assert indexer.tokens_to_indices([tokens[-1]], vocab, "tokens-1") == {
            "tokens-1": [none_index]
        }

    def test_padding_functions(self):
        indexer = NerTagIndexer()
        assert indexer.get_padding_lengths(0) == {}

    def test_as_array_produces_token_sequence(self):
        indexer = NerTagIndexer()
        padded_tokens = indexer.as_padded_tensor({"key": [1, 2, 3, 4, 5]},
                                                 {"key": 10}, {})
        assert padded_tokens["key"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]

    def test_blank_ner_tag(self):
        tokens = [
            Token(token)._replace(ent_type_="")
            for token in "allennlp is awesome .".split(" ")
        ]
        indexer = NerTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spacy uses an empty string to indicate "no NER tag"
        # we convert it to "NONE"
        assert counter["ner_tokens"]["NONE"] == 4
        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index("NONE", "ner_tokens")
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab, index_name="ner")
        assert {
            "ner": [none_index, none_index, none_index, none_index]
        } == indices
def __init__(self,
             model: Model,
             dataset_reader: DatasetReader,
             language: Optional[str] = "en_core_web_sm",
             nr_samples: Optional[int] = 250,
             batch_size: Optional[int] = 32):
    super().__init__(model, dataset_reader, language)
    self.nr_samples = nr_samples
    self.batch_size = batch_size
    self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)
Example #8
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like `{"sentence": "..."}`.
    Runs the underlying model, and adds the `"label"` to the output.
    """
    sentence = json_dict["sentence"]
    if not hasattr(self._dataset_reader, "tokenizer") and not hasattr(
            self._dataset_reader, "_tokenizer"):
        tokenizer = SpacyTokenizer()
        sentence = [str(t) for t in tokenizer.tokenize(sentence)]
    return self._dataset_reader.text_to_instance(sentence)
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
    """
    Expects JSON that looks like `{"sentence": "..."}`.
    Runs the underlying model, and adds the `"label"` to the output.
    """
    sentence = json_dict["sentence"]
    reader_has_tokenizer = (
        getattr(self._dataset_reader, "tokenizer", None) is not None
        or getattr(self._dataset_reader, "_tokenizer", None) is not None)
    if not reader_has_tokenizer:
        tokenizer = SpacyTokenizer()
        sentence = tokenizer.tokenize(sentence)
    return self._dataset_reader.text_to_instance(sentence)
Example #10
    def test_enumerate_spans_enumerates_all_spans(self):
        tokenizer = SpacyTokenizer(pos_tags=True)
        sentence = tokenizer.tokenize("This is a sentence.")

        spans = span_utils.enumerate_spans(sentence)
        assert spans == [
            (0, 0),
            (0, 1),
            (0, 2),
            (0, 3),
            (0, 4),
            (1, 1),
            (1, 2),
            (1, 3),
            (1, 4),
            (2, 2),
            (2, 3),
            (2, 4),
            (3, 3),
            (3, 4),
            (4, 4),
        ]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2)
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (2, 4),
                         (3, 4)]

        spans = span_utils.enumerate_spans(sentence,
                                           max_span_width=3,
                                           min_span_width=2,
                                           offset=20)
        assert spans == [(20, 21), (20, 22), (21, 22), (21, 23), (22, 23),
                         (22, 24), (23, 24)]

        def no_prefixed_punctuation(tokens: List[Token]):
            # Only include spans which don't start or end with punctuation.
            return tokens[0].pos_ != "PUNCT" and tokens[-1].pos_ != "PUNCT"

        spans = span_utils.enumerate_spans(
            sentence,
            max_span_width=3,
            min_span_width=2,
            filter_function=no_prefixed_punctuation)

        # No longer includes (2, 4) or (3, 4) as these include punctuation
        # as their last element.
        assert spans == [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]
Example #11
def add_perturbations(df: pd.DataFrame,
                      tokenizer,
                      sentence_col_name: str,
                      perturbation_functions,
                      seed=3):
    """
    Apply multiple perturbations, generating a new column for each perturbation

    :param tokenizer: SpacyTokenizer or BertTokenizer
    :param df: DataFrame containing sentences
    :param sentence_col_name: Name of column containing sentence to be perturbed
    :param perturbation_functions: List of perturbation functions
    :param seed: Random seed
    :return: DataFrame with additional columns containing perturbed sentences and success flags
    """
    df = df.copy()

    tokenizer = tokenizer or SpacyTokenizer()

    orig_index = df.columns.get_loc(sentence_col_name)

    tokens_orig = [[
        str(x) for x in tokenizer.tokenize(df.iloc[i, orig_index])
    ] for i in range(len(df))]

    df['tokens_orig'] = tokens_orig

    np.random.seed(seed)  # Set seed as some perturbations are stochastic

    for perturbation in perturbation_functions:
        perturbation(df, orig_index, tokens_orig)

    return df
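
A minimal, hypothetical call to add_perturbations(); the DataFrame contents, column name, and the toy perturbation function are all assumptions made up for illustration.

import pandas as pd

def drop_first_token(df, orig_index, tokens_orig):
    # Toy perturbation: record each sentence with its first token removed.
    df["drop_first_token"] = [" ".join(tokens[1:]) for tokens in tokens_orig]

frame = pd.DataFrame({"sentence": ["John joined the club.",
                                   "Mary went to Seattle."]})
perturbed = add_perturbations(frame, SpacyTokenizer(), "sentence",
                              [drop_first_token])
print(perturbed[["sentence", "tokens_orig", "drop_first_token"]])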
Example #12
class DialogQAPredictor(Predictor):
    def __init__(
        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
    ) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language=language)

    def predict(self, jsonline: str) -> JsonDict:
        """
        Make a dialog-style question answering prediction on the supplied input.
        The supplied input json must contain a list of
        question answer pairs, containing question, answer, yesno, followup, id
        as well as the context (passage).

        Parameters
        ----------
        jsonline : ``str``
            A json line that has the same format as the quac data file.

        Returns
        ----------
        A dictionary that represents the prediction made by the system.  The answer string will be under the
        "best_span_str" key.
        """
        return self.predict_json(json.loads(jsonline))

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects json that looks like the original quac data file.
        """
        paragraph_json = json_dict["paragraphs"][0]
        paragraph = paragraph_json["context"]
        tokenized_paragraph = self._tokenizer.tokenize(paragraph)
        qas = paragraph_json["qas"]
        metadata = {}
        metadata["instance_id"] = [qa["id"] for qa in qas]
        question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
        answer_texts_list = [[answer["text"] for answer in qa["answers"]] for qa in qas]
        metadata["answer_texts_list"] = answer_texts_list
        metadata["question"] = question_text_list
        span_starts_list = [[answer["answer_start"] for answer in qa["answers"]] for qa in qas]
        span_ends_list = []
        for st_list, an_list in zip(span_starts_list, answer_texts_list):
            span_ends = [start + len(answer) for start, answer in zip(st_list, an_list)]
            span_ends_list.append(span_ends)
        yesno_list = [str(qa["yesno"]) for qa in qas]
        followup_list = [str(qa["followup"]) for qa in qas]
        instance = self._dataset_reader.text_to_instance(
            question_text_list,
            paragraph,
            span_starts_list,
            span_ends_list,
            tokenized_paragraph,
            yesno_list,
            followup_list,
            metadata,
        )
        return instance
Example #13
    def _read(self, file_path: str) -> Iterator[Instance]:
        tokenizer = SpacyTokenizer()
        root = ElementTree.parse(file_path).getroot()
        xml_sents = root.findall("./sentence")

        for xml_sent in tqdm(xml_sents):
            text = xml_sent.find("text").text
            annotations = xml_sent.find('aspectTerms')
            if annotations is not None:
                annotations = annotations.findall("aspectTerm")
            else:
                annotations = []

            # Sorts the annotations by start character
            annotations.sort(key=lambda x: int(x.get('from')))

            # Tokenizes the sentence
            tokens = tokenizer.tokenize(text)

            # Assigns tags based on annotations
            tags = []
            next = 0
            current = None
            for token in tokens:
                # Checks if the next annotation begins somewhere in this token
                start_entity = next < len(annotations)
                start_entity = start_entity and token.idx <= int(
                    annotations[next].get('from'))
                start_entity = start_entity and token.idx + len(
                    token.text) > int(annotations[next].get('from'))

                if start_entity:
                    tags.append('I' if current is None else 'B')
                    current = annotations[next]
                    next += 1
                elif current is not None:
                    if token.idx < int(current.get('to')):
                        tags.append('I')
                    else:
                        tags.append('O')
                        current = None
                else:
                    tags.append('O')

            yield self.text_to_instance(xml_sent.get('id'), tokens, tags)
class TestDepLabelIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyTokenizer(parse=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        indexer = DepLabelIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)

        assert counter["dep_labels"] == {
            "ROOT": 1,
            "nsubj": 1,
            "det": 1,
            "NONE": 2,
            "attr": 1,
            "punct": 1,
        }

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        vocab = Vocabulary()
        root_index = vocab.add_token_to_namespace("ROOT", namespace="dep_labels")
        none_index = vocab.add_token_to_namespace("NONE", namespace="dep_labels")
        indexer = DepLabelIndexer()
        assert indexer.tokens_to_indices([tokens[1]], vocab) == {"tokens": [root_index]}
        assert indexer.tokens_to_indices([tokens[-1]], vocab) == {"tokens": [none_index]}

    def test_as_array_produces_token_sequence(self):
        indexer = DepLabelIndexer()
        padded_tokens = indexer.as_padded_tensor_dict({"tokens": [1, 2, 3, 4, 5]}, {"tokens": 10})
        assert padded_tokens["tokens"].tolist() == [1, 2, 3, 4, 5, 0, 0, 0, 0, 0]
    def test_keep_spacy_tokens(self):
        word_tokenizer = SpacyTokenizer()
        sentence = "This should be an allennlp Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
class SentenceTaggerPredictor(Predictor):
    """
    Predictor for any model that takes in a sentence and returns
    a single set of tags for it.  In particular, it can be used with
    the [`CrfTagger`](../models/crf_tagger.md) model
    and also the [`SimpleTagger`](../models/simple_tagger.md) model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = "en_core_web_sm") -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"sentence": "..."}`.
        Runs the underlying model, and adds the `"words"` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance(tokens)

    @overrides
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, numpy.ndarray]) -> List[Instance]:
        """
        This function currently only handles BIOUL tags.

        Imagine an NER model predicts three named entities (each one with potentially
        multiple tokens). For each individual entity, we create a new Instance that has
        the label set to only that entity and the rest of the tokens are labeled as outside.
        We then return a list of those Instances.

        For example:
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O   U-Loc  O   O     B-Org     L-Org

        We create three instances.
        Mary  went to Seattle to visit Microsoft Research
        U-Per  O    O    O     O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O   U-LOC  O   O       O         O

        Mary  went to Seattle to visit Microsoft Research
        O      O    O    O     O   O     B-Org     L-Org
        """
        predicted_tags = outputs["tags"]
        predicted_spans = []

        i = 0
        while i < len(predicted_tags):
            tag = predicted_tags[i]
            # if it's a U, add it to the list
            if tag[0] == "U":
                current_tags = [
                    t if idx == i else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            # if it's a B, keep going until you hit an L.
            elif tag[0] == "B":
                begin_idx = i
                while tag[0] != "L":
                    i += 1
                    tag = predicted_tags[i]
                end_idx = i
                current_tags = [
                    t if begin_idx <= idx <= end_idx else "O"
                    for idx, t in enumerate(predicted_tags)
                ]
                predicted_spans.append(current_tags)
            i += 1

        # Creates a new instance for each contiguous tag
        instances = []
        for labels in predicted_spans:
            new_instance = deepcopy(instance)
            text_field: TextField = instance["tokens"]  # type: ignore
            new_instance.add_field("tags",
                                   SequenceLabelField(labels, text_field),
                                   self._model.vocab)
            instances.append(new_instance)
        # NER tags are in the opposite order as desired for the interpret UI
        instances.reverse()

        return instances
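
A stand-alone sketch of the BIOUL span-splitting loop used in predictions_to_labeled_instances above, run directly on the tag sequence from the docstring (hypothetical input, no model or Instance required).

predicted_tags = ["U-Per", "O", "O", "U-Loc", "O", "O", "B-Org", "L-Org"]
predicted_spans = []
i = 0
while i < len(predicted_tags):
    tag = predicted_tags[i]
    if tag[0] == "U":
        # A unit-length entity becomes its own masked tag sequence.
        predicted_spans.append([t if idx == i else "O"
                                for idx, t in enumerate(predicted_tags)])
    elif tag[0] == "B":
        # Walk forward to the matching L tag to capture the whole entity.
        begin_idx = i
        while tag[0] != "L":
            i += 1
            tag = predicted_tags[i]
        predicted_spans.append([t if begin_idx <= idx <= i else "O"
                                for idx, t in enumerate(predicted_tags)])
    i += 1
print(len(predicted_spans))  # 3, one masked tag sequence per predicted entity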
Example #17
class BiaffineDependencyParserPredictor(Predictor):
    """
    Predictor for the [`BiaffineDependencyParser`](../models/biaffine_dependency_parser.md) model.
    """

    def __init__(
        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
    ) -> None:
        super().__init__(model, dataset_reader)
        # TODO(Mark) Make the language configurable and based on a model attribute.
        self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a dependency parse for the given sentence.
        # Parameters

        sentence The sentence to parse.

        # Returns

        A dictionary representation of the dependency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"sentence": "..."}`.
        """
        spacy_tokens = self._tokenizer.tokenize(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        if self._dataset_reader.use_language_specific_pos:  # type: ignore
            # fine-grained part of speech
            pos_tags = [token.tag_ for token in spacy_tokens]
        else:
            # coarse-grained part of speech (Universal Dependencies format)
            pos_tags = [token.pos_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        words = outputs["words"]
        pos = outputs["pos"]
        heads = outputs["predicted_heads"]
        tags = outputs["predicted_dependencies"]
        outputs["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            words = output["words"]
            pos = output["pos"]
            heads = output["predicted_heads"]
            tags = output["predicted_dependencies"]
            output["hierplane_tree"] = self._build_hierplane_tree(words, heads, tags, pos)
        return sanitize(outputs)

    @staticmethod
    def _build_hierplane_tree(
        words: List[str], heads: List[int], tags: List[str], pos: List[str]
    ) -> Dict[str, Any]:
        """
        # Returns

        A JSON dictionary render-able by Hierplane for the given tree.
        """

        word_index_to_cumulative_indices: Dict[int, Tuple[int, int]] = {}
        cumulative_index = 0
        for i, word in enumerate(words):
            word_length = len(word) + 1
            word_index_to_cumulative_indices[i] = (cumulative_index, cumulative_index + word_length)
            cumulative_index += word_length

        def node_constuctor(index: int):
            children = []
            for next_index, child in enumerate(heads):
                if child == index + 1:
                    children.append(node_constuctor(next_index))

            # These are the icons which show up in the bottom right
            # corner of the node.
            attributes = [pos[index]]
            start, end = word_index_to_cumulative_indices[index]

            hierplane_node = {
                "word": words[index],
                # The type of the node - all nodes with the same
                # type have a unified colour.
                "nodeType": tags[index],
                # Attributes of the node.
                "attributes": attributes,
                # The link between the node and its parent.
                "link": tags[index],
                "spans": [{"start": start, "end": end}],
            }
            if children:
                hierplane_node["children"] = children
            return hierplane_node

        # We are guaranteed that there is a single word pointing to
        # the root index, so we can find it just by searching for 0 in the list.
        root_index = heads.index(0)
        hierplane_tree = {
            "text": " ".join(words),
            "root": node_constuctor(root_index),
            "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
            "linkToPosition": LINK_TO_POSITION,
        }
        return hierplane_tree
Example #18
def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
    super().__init__(model, dataset_reader)
    self._tokenizer = dataset_reader._tokenizer or SpacyTokenizer()
def setup_method(self):
    super().setup_method()
    self.word_tokenizer = SpacyTokenizer()
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """
    def __init__(
        self,
        model: Model,
        dataset_reader: DatasetReader,
        language: str = "en_core_web_sm",
    ) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language=language,
                                         pos_tags=True,
                                         split_on_spaces=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.
        Parameters
        ----------
        sentence The sentence to parse.

        Returns
        -------
        A dictionary representation of the constituency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        """
        spacy_tokens = self._tokenizer.tokenize(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        sentence_id = None
        if "sentence_id" in json_dict:
            sentence_id = json_dict["sentence_id"]

        instance: Instance = self._dataset_reader.text_to_instance(
            sentence_text, pos_tags)
        if sentence_id is not None:
            fields = instance.fields
            # Adding sentence_id to the metadata
            metadata = fields["metadata"].metadata
            metadata["sentence_id"] = sentence_id
            fields["metadata"] = MetadataField(metadata)
            return Instance(fields)
        else:
            return instance

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        # format the NLTK tree as a string on a single line.
        tree = outputs.pop("trees")
        outputs["hierplane_tree"] = self._build_hierplane_tree(tree,
                                                               0,
                                                               0,
                                                               is_root=True)
        outputs["trees"] = tree.pformat(margin=1000000)
        outputs['nltk_tree'] = tree
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self,
                               instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            # format the NLTK tree as a string on a single line.
            tree = output.pop("trees")
            output["hierplane_tree"] = self._build_hierplane_tree(tree,
                                                                  0,
                                                                  0,
                                                                  is_root=True)
            output["trees"] = tree.pformat(margin=1000000)
            output['nltk_tree'] = tree
        return sanitize(outputs)

    def get_parse_spans(self, node, spans):
        if "root" in node:
            spans = self.get_parse_spans(node["root"], spans)
        else:
            nodelabel = node["nodeType"]
            spans.append((node["start"], node["end"], node["word"], nodelabel))
            if 'children' in node:
                for child in node['children']:
                    spans = self.get_parse_spans(child, spans)
        return spans

    @overrides
    def dump_line(self, output: JsonDict) -> str:  # pylint: disable=no-self-use
        output.pop('class_probabilities', None)
        output['hierplane_tree'].pop('linkNameToLabel', None)
        output['hierplane_tree'].pop('nodeTypeToStyle', None)
        tree = output['hierplane_tree']
        # Spans are 4-tuples of (start, end (exclusive), span_text, span_label)
        spans = self.get_parse_spans(tree, [])
        sentence_id = None
        if "sentence_id" in output['metadata']:
            sentence_id = output['metadata']['sentence_id']
        tokens = output['metadata']['tokens']

        output_jsonl_dict = {
            'sentence_id': sentence_id,
            'tokens': tokens,
            'spans': spans
        }
        return json.dumps(output_jsonl_dict) + "\n"

    def _build_hierplane_tree(self, tree: Tree, index: int,
                              start_token_index: int,
                              is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        start_token_index : int, required.
            The token index of the left-most leaf of this tree, used for storing span indices in the tree nodes.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        prev_children_num_tokens = 0
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(
                    self._build_hierplane_tree(child,
                                               index,
                                               start_token_index +
                                               prev_children_num_tokens,
                                               is_root=False))
                # The next child's starting index is offset by the total number of tokens in the children to its left.
                num_tokens_child = len(child.leaves())
                prev_children_num_tokens += num_tokens_child
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        # Span indexing works because children are traversed left-to-right;
        # the ConstituencyParser model builds these NLTK trees in that order.
        num_tokens = len(span.split(" "))
        hierplane_node = {
            "word": span,
            "start": start_token_index,
            "end": start_token_index + num_tokens,
            "nodeType": label,
            "attributes": [label],
            "link": label,
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to add span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": hierplane_node,
            }
        return hierplane_node
Example #21
class ConstituencyParserPredictor(Predictor):
    """
    Predictor for the :class:`~allennlp.models.SpanConstituencyParser` model.
    """

    def __init__(
        self, model: Model, dataset_reader: DatasetReader, language: str = "en_core_web_sm"
    ) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predict a constituency parse for the given sentence.
        # Parameters

        sentence The sentence to parse.

        # Returns

        A dictionary representation of the constituency tree.
        """
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"sentence": "..."}`.
        """
        spacy_tokens = self._tokenizer.tokenize(json_dict["sentence"])
        sentence_text = [token.text for token in spacy_tokens]
        pos_tags = [token.tag_ for token in spacy_tokens]
        return self._dataset_reader.text_to_instance(sentence_text, pos_tags)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:
        outputs = self._model.forward_on_instance(instance)

        # format the NLTK tree as a string on a single line.
        tree = outputs.pop("trees")
        outputs["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
        outputs["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)

    @overrides
    def predict_batch_instance(self, instances: List[Instance]) -> List[JsonDict]:
        outputs = self._model.forward_on_instances(instances)
        for output in outputs:
            # format the NLTK tree as a string on a single line.
            tree = output.pop("trees")
            output["hierplane_tree"] = self._build_hierplane_tree(tree, 0, is_root=True)
            output["trees"] = tree.pformat(margin=1000000)
        return sanitize(outputs)

    def _build_hierplane_tree(self, tree: Tree, index: int, is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK `Tree` suitable for
        rendering trees using the `Hierplane library<https://allenai.github.io/hierplane/>`.

        # Parameters

        tree : `Tree`, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        # Returns

        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {"word": span, "nodeType": label, "attributes": [label], "link": label}
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to add span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": hierplane_node,
            }
        return hierplane_node
class MCDropoutSentenceTaggerPredictor(Predictor):
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: Optional[str] = "en_core_web_sm",
                 nr_samples: Optional[int] = 250,
                 batch_size: Optional[int] = 32):
        super().__init__(model, dataset_reader, language)
        self.nr_samples = nr_samples
        self.batch_size = batch_size
        self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"sentence": sentence})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"sentence": "..."}`.
        Runs the underlying model, and adds the `"words"` to the output.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self._dataset_reader.text_to_instance(tokens)

    @overrides
    def predict_instance(self, instance: Instance) -> JsonDict:

        sub_models = self._model.get_all_models()

        # turn on dropout
        for model in sub_models.values():
            for module in model.modules():
                if isinstance(module, InputVariationalDropout):
                    module.train()

        # forward `nr_samples` amount of times in batches
        all_outputs = []

        batch_sizes = [self.batch_size] * (self.nr_samples // self.batch_size)
        if self.nr_samples % self.batch_size > 0:
            batch_sizes.append(self.nr_samples % self.batch_size)

        for batch_size in batch_sizes:
            batch = [instance] * batch_size
            outputs = self._model.forward_on_instances(batch)
            all_outputs.extend(outputs)

        final_outputs = {'words': all_outputs[0]['meta_words']}

        for model_key in sub_models.keys():

            # collect class probabilities
            all_class_probs = []
            for outputs in all_outputs:
                tensor = torch.from_numpy(
                    outputs[f'{model_key}_class_probabilities'])
                all_class_probs.append(tensor)
            all_class_probs = torch.stack(all_class_probs)

            # calculate mean and variance
            mean = all_class_probs.mean(dim=0)
            std = all_class_probs.std(dim=0)

            final_outputs[f'{model_key}_class_probabilities'] = mean
            final_outputs[f'{model_key}_class_prob_std'] = std

        return sanitize(final_outputs)

    @overrides
    def predictions_to_labeled_instances(
            self, instance: Instance,
            outputs: Dict[str, np.ndarray]) -> List[Instance]:
        new_instance = instance.duplicate()
        text_field: TextField = instance["tokens"]
        for name in self._model.all_model_keys:
            predicted_tags = np.argmax(outputs[f"{name}_class_probabilities"],
                                       axis=-1)[:len(text_field)].tolist()
            new_instance.add_field(
                f"{name}_tags", SequenceLabelField(predicted_tags, text_field),
                self._model.vocab)

        return [new_instance]
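
A quick sketch of the batching arithmetic in predict_instance above: with the default arguments, 250 Monte Carlo samples are split into seven full batches of 32 plus one remainder batch of 26 (the numbers here simply mirror those defaults).

nr_samples, batch_size = 250, 32
batch_sizes = [batch_size] * (nr_samples // batch_size)
if nr_samples % batch_size > 0:
    batch_sizes.append(nr_samples % batch_size)
print(batch_sizes)       # [32, 32, 32, 32, 32, 32, 32, 26]
print(sum(batch_sizes))  # 250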
def setUp(self):
    super().setUp()
    self.tokenizer = SpacyTokenizer(pos_tags=True)
class TestPosTagIndexer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.tokenizer = SpacyTokenizer(pos_tags=True)

    def test_count_vocab_items_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [Token("<S>")] + [t for t in tokens] + [Token("</S>")]
        # Hard-coding this because spacy's POS tagger keeps changing on us, wanting to call this AUX
        # in some runs.
        tokens[2] = Token("is", tag_="VBZ", pos_="VERB")
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "DT": 2,
            "VBZ": 1,
            ".": 1,
            "NN": 1,
            "NONE": 2
        }

        indexer._coarse_tags = True
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        assert counter["pos_tokens"] == {
            "VERB": 1,
            "PUNCT": 1,
            "DET": 2,
            "NOUN": 1,
            "NONE": 2
        }

    def test_tokens_to_indices_uses_pos_tags(self):
        tokens = self.tokenizer.tokenize("This is a sentence.")
        tokens = [t for t in tokens] + [Token("</S>")]
        tokens[1] = Token("is", tag_="VBZ", pos_="VERB")
        vocab = Vocabulary()
        verb_index = vocab.add_token_to_namespace("VERB", namespace="pos_tags")
        cop_index = vocab.add_token_to_namespace("VBZ", namespace="pos_tags")
        none_index = vocab.add_token_to_namespace("NONE", namespace="pos_tags")
        # Have to add other tokens too, since we're calling `tokens_to_indices` on all of them
        vocab.add_token_to_namespace("DET", namespace="pos_tags")
        vocab.add_token_to_namespace("NOUN", namespace="pos_tags")
        vocab.add_token_to_namespace("PUNCT", namespace="pos_tags")

        indexer = PosTagIndexer(namespace="pos_tags", coarse_tags=True)

        indices = indexer.tokens_to_indices(tokens, vocab)
        assert len(indices) == 1
        assert "tokens" in indices
        assert indices["tokens"][1] == verb_index
        assert indices["tokens"][-1] == none_index

        indexer._coarse_tags = False
        assert indexer.tokens_to_indices([tokens[1]], vocab) == {
            "tokens": [cop_index]
        }

    def test_as_array_produces_token_sequence(self):
        indexer = PosTagIndexer()
        padded_tokens = indexer.as_padded_tensor_dict(
            {"tokens": [1, 2, 3, 4, 5]}, {"tokens": 10})
        assert padded_tokens["tokens"].tolist() == [
            1, 2, 3, 4, 5, 0, 0, 0, 0, 0
        ]

    def test_blank_pos_tag(self):
        tokens = [
            Token(token)._replace(pos_="")
            for token in "allennlp is awesome .".split(" ")
        ]
        indexer = PosTagIndexer()
        counter = defaultdict(lambda: defaultdict(int))
        for token in tokens:
            indexer.count_vocab_items(token, counter)
        # spacy uses an empty string to indicate "no POS tag"
        # we convert it to "NONE"
        assert counter["pos_tokens"]["NONE"] == 4
        vocab = Vocabulary(counter)
        none_index = vocab.get_token_index("NONE", "pos_tokens")
        # should raise no exception
        indices = indexer.tokens_to_indices(tokens, vocab)
        assert {
            "tokens": [none_index, none_index, none_index, none_index]
        } == indices
def setUp(self):
    super().setUp()
    self.tokenizer = SpacyTokenizer(ner=True)
class SemanticRoleLabelerPredictor(Predictor):
    """
    Predictor for the [`SemanticRoleLabeler`](../models/semantic_role_labeler.md) model.
    """
    def __init__(self,
                 model: Model,
                 dataset_reader: DatasetReader,
                 language: str = "en_core_web_sm") -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyTokenizer(language=language, pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence and returns a dictionary
        with the results.

        ```
        {"words": [...],
         "verbs": [
            {"verb": "...", "description": "...", "tags": [...]},
            ...
            {"verb": "...", "description": "...", "tags": [...]},
        ]}
        ```

        # Parameters

        sentence, `str`
            The sentence to parse via semantic role labeling.

        # Returns

        A dictionary representation of the semantic roles in the sentence.
        """
        return self.predict_json({"sentence": sentence})

    def predict_tokenized(self, tokenized_sentence: List[str]) -> JsonDict:
        """
        Predicts the semantic roles of the supplied sentence tokens and returns a dictionary
        with the results.

        # Parameters

        tokenized_sentence, `List[str]`
            The sentence tokens to parse via semantic role labeling.

        # Returns

        A dictionary representation of the semantic roles in the sentence.
        """
        spacy_doc = Doc(self._tokenizer.spacy.vocab, words=tokenized_sentence)
        for pipe in filter(None, self._tokenizer.spacy.pipeline):
            pipe[1](spacy_doc)

        tokens = [token for token in spacy_doc]
        instances = self.tokens_to_instances(tokens)

        if not instances:
            return sanitize({"verbs": [], "words": tokens})

        return self.predict_instances(instances)

    @staticmethod
    def make_srl_string(words: List[str], tags: List[str]) -> str:
        frame = []
        chunk = []

        for (token, tag) in zip(words, tags):
            if tag.startswith("I-"):
                chunk.append(token)
            else:
                if chunk:
                    frame.append("[" + " ".join(chunk) + "]")
                    chunk = []

                if tag.startswith("B-"):
                    chunk.append(tag[2:] + ": " + token)
                elif tag == "O":
                    frame.append(token)

        if chunk:
            frame.append("[" + " ".join(chunk) + "]")

        return " ".join(frame)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict):
        raise NotImplementedError(
            "The SRL model uses a different API for creating instances.")

    def tokens_to_instances(self, tokens):
        words = [token.text for token in tokens]
        instances: List[Instance] = []
        for i, word in enumerate(tokens):
            if word.pos_ == "VERB":
                verb_labels = [0 for _ in words]
                verb_labels[i] = 1
                instance = self._dataset_reader.text_to_instance(
                    tokens, verb_labels)
                instances.append(instance)
        return instances

    def _sentence_to_srl_instances(self,
                                   json_dict: JsonDict) -> List[Instance]:
        """
        The SRL model has a slightly different API from other models, as the model is run
        forward for every verb in the sentence. This means that for a single sentence, we need
        to generate a `List[Instance]`, where the length of this list corresponds to the number
        of verbs in the sentence. Additionally, all of these verbs share the same return dictionary
        after being passed through the model (as really we care about all the frames of the sentence
        together, rather than separately).

        # Parameters

        json_dict : `JsonDict`, required.
            JSON that looks like `{"sentence": "..."}`.

        # Returns

        instances : `List[Instance]`
            One instance per verb.
        """
        sentence = json_dict["sentence"]
        tokens = self._tokenizer.tokenize(sentence)
        return self.tokens_to_instances(tokens)

    @overrides
    def predict_batch_json(self, inputs: List[JsonDict]) -> List[JsonDict]:
        """
        Expects JSON that looks like `[{"sentence": "..."}, {"sentence": "..."}, ...]`
        and returns JSON that looks like

        ```
        [
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]},
            {"words": [...],
             "verbs": [
                {"verb": "...", "description": "...", "tags": [...]},
                ...
                {"verb": "...", "description": "...", "tags": [...]},
            ]}
        ]
        ```
        """
        # For SRL, we have more instances than sentences, but the user specified
        # a batch size with respect to the number of sentences passed, so we respect
        # that here by taking the batch size which we use to be the number of sentences
        # we are given.
        batch_size = len(inputs)
        instances_per_sentence = [
            self._sentence_to_srl_instances(json) for json in inputs
        ]

        flattened_instances = [
            instance for sentence_instances in instances_per_sentence
            for instance in sentence_instances
        ]

        if not flattened_instances:
            return sanitize([{
                "verbs": [],
                "words": self._tokenizer.tokenize(x["sentence"])
            } for x in inputs])

        # Make the instances into batches and check the last batch for
        # padded elements as the number of instances might not be perfectly
        # divisible by the batch size.
        batched_instances = group_by_count(flattened_instances, batch_size,
                                           None)
        batched_instances[-1] = [
            instance for instance in batched_instances[-1]
            if instance is not None
        ]
        # Run the model on the batches.
        outputs: List[Dict[str, numpy.ndarray]] = []
        for batch in batched_instances:
            outputs.extend(self._model.forward_on_instances(batch))

        verbs_per_sentence = [len(sent) for sent in instances_per_sentence]
        return_dicts: List[JsonDict] = [{"verbs": []} for x in inputs]

        output_index = 0
        for sentence_index, verb_count in enumerate(verbs_per_sentence):
            if verb_count == 0:
                # We didn't run any predictions for sentences with no verbs,
                # so we don't have a way to extract the original sentence.
                # Here we just tokenize the input again.
                original_text = self._tokenizer.tokenize(
                    inputs[sentence_index]["sentence"])
                return_dicts[sentence_index]["words"] = original_text
                continue

            for _ in range(verb_count):
                output = outputs[output_index]
                words = output["words"]
                tags = output["tags"]
                description = self.make_srl_string(words, tags)
                return_dicts[sentence_index]["words"] = words
                return_dicts[sentence_index]["verbs"].append({
                    "verb": output["verb"],
                    "description": description,
                    "tags": tags,
                })
                output_index += 1

        return sanitize(return_dicts)

    def predict_instances(self, instances: List[Instance]) -> JsonDict:
        outputs = self._model.forward_on_instances(instances)

        results = {"verbs": [], "words": outputs[0]["words"]}
        for output in outputs:
            tags = output["tags"]
            description = self.make_srl_string(output["words"], tags)
            results["verbs"].append({
                "verb": output["verb"],
                "description": description,
                "tags": tags
            })

        return sanitize(results)

    @overrides
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        """
        Expects JSON that looks like `{"sentence": "..."}`
        and returns JSON that looks like

        ```
        {"words": [...],
         "verbs": [
            {"verb": "...", "description": "...", "tags": [...]},
            ...
            {"verb": "...", "description": "...", "tags": [...]},
        ]}
        ```
        """
        instances = self._sentence_to_srl_instances(inputs)

        if not instances:
            return sanitize({
                "verbs": [],
                "words": self._tokenizer.tokenize(inputs["sentence"]),
            })

        return self.predict_instances(instances)
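
Since make_srl_string() above is a @staticmethod, it can be exercised without a model; the words and BIO tags below are made up to show the bracketed frame format it produces.

words = ["John", "refused", "to", "consider", "joining", "the", "club", "."]
tags = ["B-ARG0", "B-V", "B-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "I-ARG1", "O"]
print(SemanticRoleLabelerPredictor.make_srl_string(words, tags))
# [ARG0: John] [V: refused] [ARG1: to consider joining the club] .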
Example #27
}

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
frames_i2w = ["Economic", "Capacity-and-resrouces", "Morality", "Fairness-and-equality",
              "Legality,-constitutionality-and-jurisprudence", "Policy-prescription-and-evaluation",
              "Crime-and-punishment", "Security-and-defense", "Health-and-safety", "Quality-of-life",
              "Cultural-identity", "Public-opinion", "Political", "External-regulation-and-reputation", "Other"]
MAX_EPOCHS = 100

nlp = spacy.load("en_core_web_sm")
nlp_lg = spacy.load("en_core_web_lg")
samesex_6_balanced_label_mapping = {10: 0, 11: 1, 12: 2, 3: 3, 5: 4, 13: 5}
spacy_tokenizer = SpacyTokenizer(language="en_core_web_sm", pos_tags=True)


# e.g. tokens = spacy_tokenizer.tokenize(sentence)


def deserialize_from_file(filename="data.json"):
    with open(filename, "rb") as read_file:
        data = json.load(read_file)
        return data


def serialize_to_file(filename="data.json", data=None):
    with open(filename, "w") as write_file:
        json.dump(data, write_file)
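
A small round-trip sketch for the serialize/deserialize helpers above, using a temporary file so no real data file is touched.

import os
import tempfile

path = os.path.join(tempfile.mkdtemp(), "data.json")
serialize_to_file(path, {"labels": [0, 1, 2]})
print(deserialize_from_file(path))  # {'labels': [0, 1, 2]}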
class TestSpacyTokenizer(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.word_tokenizer = SpacyTokenizer()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = [
            "this",
            "(",
            "sentence",
            ")",
            "has",
            "'",
            "crazy",
            "'",
            '"',
            "punctuation",
            '"',
            ".",
        ]
        tokens = self.word_tokenizer.tokenize(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = [
            "it",
            "ai",
            "n't",
            "joe",
            "'s",
            "problem",
            ";",
            "would",
            "been",
            "yesterday",
        ]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones'   house  \x0b  55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = [
            "Mr.",
            "and",
            "Mrs.",
            "Jones",
            ",",
            "etc",
            ".",
            ",",
            "went",
            "to",
            ",",
            "e.g.",
            ",",
            "the",
            "store",
        ]
        tokens = [t.text for t in self.word_tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        sentences = [
            "This is     a sentence",
            "This isn't a sentence.",
            "This is the 3rd     sentence." "Here's the 'fourth' sentence.",
        ]
        batch_split = self.word_tokenizer.batch_tokenize(sentences)
        separately_split = [self.word_tokenizer.tokenize(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        word_tokenizer = SpacyTokenizer()
        sentence = "This should be an allennlp Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_tokenizer = SpacyTokenizer(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_tokenizer.tokenize(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
def setUp(self):
    super().setUp()
    self.word_tokenizer = SpacyTokenizer()