def test_whitespace_does_not_throw_error():
    texts = rasa.shared.utils.io.read_json_file(
        "data/test_tokenizers/naughty_strings.json")

    tk = WhitespaceTokenizer()

    for text in texts:
        tk.tokenize(Message.build(text=text), attribute=TEXT)
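The snippets on this page are excerpted from their test suites and omit imports. For orientation, here is a self-contained sketch of the same pattern; the import paths assume a Rasa 2.x-style layout and are not part of the example above, so adjust them for your version.

# Minimal, hedged sketch of direct WhitespaceTokenizer usage (Rasa 2.x-style API).
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message

tokenizer = WhitespaceTokenizer()
message = Message(data={TEXT: "Forecast for lunch"})

# tokenize() returns Token objects carrying the token text plus character offsets.
tokens = tokenizer.tokenize(message, attribute=TEXT)
print([(t.text, t.start, t.end) for t in tokens])
# roughly: [('Forecast', 0, 8), ('for', 9, 12), ('lunch', 13, 18)]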
Example #2
def test_whitespace_does_not_throw_error():
    import rasa.utils.io as io_utils

    texts = io_utils.read_json_file(
        "data/test_tokenizers/naughty_strings.json")

    tk = WhitespaceTokenizer()

    for text in texts:
        tk.tokenize(Message(text), attribute=TEXT)
def test_whitespace_cls_token():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {"use_cls_token": True}

    tk = WhitespaceTokenizer(component_config)

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == [
        "Forecast",
        "for",
        "lunch",
        CLS_TOKEN,
    ]
    assert [t.offset
            for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13, 19]
Example #4
def test_repeated_entities(tmp_path: Path,
                           whitespace_tokenizer: WhitespaceTokenizer):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": "3"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)
    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
    tokens = whitespace_tokenizer.tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(entities[0],
                                                  example.get(TEXT), tokens)
    assert start == 9
    assert end == 10
Example #5
def test_multiword_entities(tmp_path: Path,
                            whitespace_tokenizer: WhitespaceTokenizer):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)
    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
    tokens = whitespace_tokenizer.tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(entities[0],
                                                  example.get(TEXT), tokens)
    assert start == 4
    assert end == 7
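Both tests above assert token-level spans (a start index and an exclusive end index) derived from character-level entity offsets. The helper below is only an illustrative sketch of that mapping, not MitieEntityExtractor's implementation; it reproduces the asserted values for the examples above.

def char_span_to_token_span(tokens, char_start, char_end):
    """Illustrative only: map a character-level entity span onto token indices.

    `tokens` are whitespace tokens with `.start`/`.end` character offsets, as
    produced by the WhitespaceTokenizer in the tests above. The returned end
    index is exclusive, matching the `start == 4` / `end == 7` assertions.
    """
    token_start = next(
        i for i, t in enumerate(tokens) if t.start <= char_start < t.end)
    token_end = next(
        i for i, t in enumerate(tokens) if t.start < char_end <= t.end) + 1
    return token_start, token_end

# For "show me flights to New York City" with entity chars 19..32 ("New York
# City"), this yields (4, 7); for the repeated "3" example it yields (9, 10).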
def test_lm_featurizer_edge_cases(model_name, model_weights, texts,
                                  expected_tokens, expected_indices):

    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {
        **{
            "model_name": model_name
        },
        **model_weights_config
    }

    lm_featurizer = LanguageModelFeaturizer(transformers_config)
    whitespace_tokenizer = WhitespaceTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens,
                                           expected_indices):

        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        message.set(TOKENS_NAMES[TEXT], tokens)
        lm_featurizer.process(message)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
Example #7
async def prepare_token_serialisation(
    tracker_store: TrackerStore, response_selector_agent: Agent, sender_id: Text
):
    text = "Good morning"
    tokenizer = WhitespaceTokenizer(WhitespaceTokenizer.get_default_config())
    tokens = tokenizer.tokenize(Message(data={"text": text}), "text")
    indices = [[t.start, t.end] for t in tokens]

    tracker = tracker_store.get_or_create_tracker(sender_id=sender_id)
    parse_data = await response_selector_agent.parse_message(text)
    event = UserUttered(
        "Good morning",
        parse_data.get("intent"),
        parse_data.get("entities", []),
        parse_data,
    )

    tracker.update(event)
    tracker_store.save(tracker)

    retrieved_tracker = tracker_store.retrieve(sender_id=sender_id)
    event = retrieved_tracker.get_last_event_for(event_type=UserUttered)
    event_tokens = event.as_dict().get("parse_data").get("text_tokens")

    assert event_tokens == indices
Example #8
def test_whitespace_with_case(text, component_config, expected_tokens):

    tk = WhitespaceTokenizer(component_config)

    message = Message(text)

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
def test_whitespace(text, expected_tokens, expected_indices):

    tk = WhitespaceTokenizer()

    tokens = tk.tokenize(Message(text), attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
class IncrementalWhitespaceTokenizer(Tokenizer, IncrementalComponent):

    provides = ["tokens"]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]]=None):

        super(IncrementalWhitespaceTokenizer, self).__init__(component_config)
        self.offset = 0
        self.tokens = []
        self.WST = WhitespaceTokenizer()

    def new_utterance(self):
        self.offset = 0
        self.tokens = []

    def train(self, training_data: TrainingData, config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        for example in training_data.training_examples:
            example.set("tokens", self.WST.tokenize(text=example.text))

    def process(self, message: Message, **kwargs: Any) -> None:
        iu_list = message.get('iu_list')
        last_iu = iu_list[-1]
        iu_word, iu_type = last_iu
        if iu_type == "add":
            token = self.WST.tokenize(iu_word)
            if token:
                token = token[0]
                token.offset = self.offset
                token.end = token.offset + token.end
                self.offset += (token.end - token.offset + 1)
                self.tokens.append(token)
        elif iu_type == "revoke":
            if len(self.tokens) > 0:
                removed = self.tokens.pop()
                self.offset = removed.offset
        else:
            logger.error("incompatible iu type, expected 'add' or 'revoke',"
                         " got '" + iu_type + "'")
        message.set("tokens", self.tokens)
Example #11
def test_whitespace_custom_intent_symbol():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    tk = WhitespaceTokenizer(component_config)

    assert [
        t.text for t in tk.tokenize("Forecast_for_LUNCH", attribute="intent")
    ] == ["Forecast_for_LUNCH"]

    assert [
        t.text for t in tk.tokenize("Forecast+for+LUNCH", attribute="intent")
    ] == [
        "Forecast",
        "for",
        "LUNCH",
    ]
Example #12
def check_subtokens(
    texts: List[Text],
    messages: List[Message],
    expected_number_of_sub_tokens: List[List[float]],
):
    whitespace_tokenizer = WhitespaceTokenizer()
    for index, message in enumerate(messages):
        assert [
            t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
        ] == expected_number_of_sub_tokens[index]
        assert len(message.get(TOKENS_NAMES[TEXT])) == len(
            whitespace_tokenizer.tokenize(Message.build(text=texts[index]), TEXT)
        )
Example #13
async def test_interpreter_parses_text_tokens(
    response_selector_interpreter: Interpreter,
):
    text = "Hello there"
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(Message(data={"text": text}), "text")
    indices = [(t.start, t.end) for t in tokens]

    parsed_data = response_selector_interpreter.parse(text)
    assert "text_tokens" in parsed_data.keys()

    parsed_tokens = parsed_data.get("text_tokens")

    assert parsed_tokens == indices
Example #14
def test_whitespace():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()

    assert ([t.text for t in tk.tokenize("Forecast for lunch")
             ] == ['Forecast', 'for', 'lunch'])

    assert ([t.offset
             for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13])

    # we ignore .,!?
    assert ([t.text for t in tk.tokenize("hey ńöñàśçií how're you?")
             ] == ['hey', 'ńöñàśçií', 'how', 're', 'you'])

    assert ([t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")
             ] == [0, 4, 13, 17, 20])

    assert ([
        t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. "
                                    "(how're you?)")
    ] == ['привет', '10.000', 'ńöñàśçií', 'how', 're', 'you'])

    assert ([
        t.offset for t in tk.tokenize("привет! 10.000, ńöñàśçií. "
                                      "(how're you?)")
    ] == [0, 8, 16, 27, 31, 34])

    # urls are single token
    assert ([
        t.text for t in tk.tokenize("https://www.google.com/search?client="
                                    "safari&rls=en&q="
                                    "i+like+rasa&ie=UTF-8&oe=UTF-8 "
                                    "https://rasa.com/docs/nlu/"
                                    "components/#tokenizer-whitespace")
    ] == [
        "https://www.google.com/search?"
        "client=safari&rls=en&q=i+like+rasa&ie=UTF-8&oe=UTF-8",
        "https://rasa.com/docs/nlu/components/#tokenizer-whitespace"
    ])

    assert ([
        t.offset for t in tk.tokenize("https://www.google.com/search?client="
                                      "safari&rls=en&q="
                                      "i+like+rasa&ie=UTF-8&oe=UTF-8 "
                                      "https://rasa.com/docs/nlu/"
                                      "components/#tokenizer-whitespace")
    ] == [0, 83])
Example #15
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
):
    extractor = EntityExtractor()
    tokenizer = WhitespaceTokenizer()

    message = Message(text)
    tokens = tokenizer.tokenize(message, TEXT)

    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, confidences)
    assert actual_entities == expected_entities
Example #16
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    extractor = EntityExtractorMixin()

    message = Message(data={TEXT: text})
    tokens = whitespace_tokenizer.tokenize(message, TEXT)

    split_entities_config = {SPLIT_ENTITIES_BY_COMMA: True}
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences)
    assert actual_entities == expected_entities
Example #17
def test_split_entities_by_comma(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
):
    extractor = EntityExtractor()
    tokenizer = WhitespaceTokenizer()

    message = Message(data={TEXT: text})
    tokens = tokenizer.tokenize(message, TEXT)

    split_entities_config = {
        SPLIT_ENTITIES_BY_COMMA: True,
        "address": False,
        "ingredient": True,
    }
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences)

    assert actual_entities == expected_entities
Example #18
async def test_processor_logs_text_tokens_in_tracker(mood_agent: Agent):
    text = "Hello there"
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(Message(data={"text": text}), "text")
    indices = [(t.start, t.end) for t in tokens]

    message = UserMessage(text)
    tracker_store = InMemoryTrackerStore(mood_agent.domain)
    lock_store = InMemoryLockStore()
    processor = MessageProcessor(
        mood_agent.interpreter,
        mood_agent.policy_ensemble,
        mood_agent.domain,
        tracker_store,
        lock_store,
        TemplatedNaturalLanguageGenerator(mood_agent.domain.responses),
    )
    tracker = await processor.log_message(message)
    event = tracker.get_last_event_for(event_type=UserUttered)
    event_tokens = event.as_dict().get("parse_data").get("text_tokens")

    assert event_tokens == indices
Example #19
def test_whitespace():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()

    assert ([t.text for t in tk.tokenize("Forecast for lunch")] ==
            ['Forecast', 'for', 'lunch'])

    assert ([t.offset for t in tk.tokenize("Forecast for lunch")] ==
            [0, 9, 13])

    # we ignore .,!?
    assert ([t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] ==
            ['hey', 'ńöñàśçií', 'how\'re', 'you'])

    assert ([t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] ==
            [0, 4, 13, 20])

    assert ([t.text
             for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] ==
            ['привет', '10.000', 'ńöñàśçií', 'how\'re', 'you'])

    assert ([t.offset
             for t in tk.tokenize("привет! 10.000, ńöñàśçií. how're you?")] ==
            [0, 8, 16, 26, 33])
Example #20
def test_whitespace_with_case():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "forecast",
        "for",
        "lunch",
    ]

    component_config = {"case_sensitive": True}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "Forecast",
        "for",
        "LUNCH",
    ]

    component_config = {}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "Forecast",
        "for",
        "LUNCH",
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    message = Message("Forecast for LUNCH")
    tk.process(message)
    assert message.data.get("tokens")[0].text == "forecast"
    assert message.data.get("tokens")[1].text == "for"
    assert message.data.get("tokens")[2].text == "lunch"

    _config = utilities.base_test_conf("supervised_embeddings")
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 4,
                    "end": 11,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 7,
                    "end": 12,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    tk.train(TrainingData(training_examples=examples), _config)
    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"
Example #21
def test_whitespace():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    tk = WhitespaceTokenizer()

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == [
        "Forecast",
        "for",
        "lunch",
    ]

    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13]

    # we ignore .,!?
    assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == [
        "hey",
        "ńöñàśçií",
        "how",
        "re",
        "you",
    ]

    assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == [
        0,
        4,
        13,
        17,
        20,
    ]

    assert [
        t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. (how're you?)")
    ] == [
        "привет",
        "10.000",
        "ńöñàśçií",
        "how",
        "re",
        "you",
    ]

    assert [
        t.offset
        for t in tk.tokenize("привет! 10.000, ńöñàśçií. (how're you?)")
    ] == [0, 8, 16, 27, 31, 34]

    # urls are single token
    assert [
        t.text for t in tk.tokenize("https://www.google.com/search?client="
                                    "safari&rls=en&q="
                                    "i+like+rasa&ie=UTF-8&oe=UTF-8 "
                                    "https://rasa.com/docs/nlu/"
                                    "components/#tokenizer-whitespace")
    ] == [
        "https://www.google.com/search?"
        "client=safari&rls=en&q=i+like+rasa&ie=UTF-8&oe=UTF-8",
        "https://rasa.com/docs/nlu/components/#tokenizer-whitespace",
    ]

    assert [
        t.offset for t in tk.tokenize("https://www.google.com/search?client="
                                      "safari&rls=en&q="
                                      "i+like+rasa&ie=UTF-8&oe=UTF-8 "
                                      "https://rasa.com/docs/nlu/"
                                      "components/#tokenizer-whitespace")
    ] == [0, 83]
Example #22
def test_crf_create_entity_dict(spacy_nlp):
    crf_extractor = CRFEntityExtractor()
    spacy_tokenizer = SpacyTokenizer()
    white_space_tokenizer = WhitespaceTokenizer()

    examples = [
        {
            "message": Message(
                "where is St. Michael's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 31,
                            "value": "St. Michael's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
                },
            )
        },
        {
            "message": Message(
                "where is Children's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 28,
                            "value": "Children's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
                },
            )
        },
    ]
    for ex in examples:
        # the spacy tokenizer receives a Doc as input and the whitespace tokenizer receives raw text
        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT)
        white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT)
        for tokenizer, tokens in [
            ("SpacyTokenizer", spacy_tokens),
            ("WhitespaceTokenizer", white_space_tokens),
        ]:
            for entity in ex["message"].get("entities"):
                parsed_entities = crf_extractor._create_entity_dict(
                    ex["message"],
                    tokens,
                    entity[tokenizer]["entity_start_token_idx"],
                    entity[tokenizer]["entity_end_token_idx"],
                    entity["entity"],
                    0.8,
                )
                assert parsed_entities == {
                    "start": entity["start"],
                    "end": entity["end"],
                    "value": entity["value"],
                    "entity": entity["entity"],
                    "confidence": 0.8,
                }
Example #23
class HFTransformersNLPCustom(HFTransformersNLP):
    """Utility Component for interfacing between Transformers library and Rasa OS.
    The transformers(https://github.com/huggingface/transformers) library
    is used to load pre-trained language models like BERT, GPT-2, etc.
    The component also tokenizes and featurizes dense featurizable attributes of each
    message.
    """
    def __init__(self,
                 component_config: Optional[Dict[Text, Any]] = None) -> None:
        super(HFTransformersNLP, self).__init__(component_config)

        self._load_model()
        self.whitespace_tokenizer = WhitespaceTokenizer()

    def _load_model(self) -> None:
        """Try loading the model"""

        from bothub.shared.utils.rasa_components.registry import (
            model_class_dict,
            model_weights_defaults,
            model_tokenizer_dict,
            from_pt_dict,
        )

        self.model_name = self.component_config["model_name"]

        if self.model_name not in model_class_dict:
            raise KeyError(
                f"'{self.model_name}' not a valid model name. Choose from "
                f"{str(list(model_class_dict.keys()))}or create"
                f"a new class inheriting from this class to support your model."
            )

        self.model_weights = self.component_config["model_weights"]
        self.cache_dir = self.component_config["cache_dir"]

        if not self.model_weights:
            logger.info(
                f"Model weights not specified. Will choose default model weights: "
                f"{model_weights_defaults[self.model_name]}")
            self.model_weights = model_weights_defaults[self.model_name]

        logger.debug(f"Loading Tokenizer and Model for {self.model_name}")

        try:
            from bothub_nlp_celery.app import nlp_language

            self.tokenizer, self.model = nlp_language
        except TypeError:
            logger.info(f"Model could not be retrieved from celery cache "
                        f"Loading model {self.model_name} in memory")
            self.tokenizer = model_tokenizer_dict[
                self.model_name].from_pretrained(
                    model_weights_defaults[self.model_name], cache_dir=None)
            self.model = model_class_dict[self.model_name].from_pretrained(
                self.model_name,
                cache_dir=None,
                from_pt=from_pt_dict.get(self.model_name, False),
            )

        # Use a universal pad token since not all transformer architectures have a
        # consistent token. Instead of pad_token_id we use unk_token_id because
        # pad_token_id is not set for all architectures. We can't add a new token as
        # well since vocabulary resizing is not yet supported for TF classes.
        # Also, this does not hurt the model predictions since we use an attention mask
        # while feeding input.
        self.pad_token_id = self.tokenizer.unk_token_id
        logger.debug(f"Loaded Tokenizer and Model for {self.model_name}")

    def _add_lm_specific_special_tokens(
            self, token_ids: List[List[int]]) -> List[List[int]]:
        """Add language model specific special tokens which were used during their training.
        Args:
            token_ids: List of token ids for each example in the batch.
        Returns:
            Augmented list of token ids for each example in the batch.
        """
        from bothub.shared.utils.rasa_components.registry import (
            model_special_tokens_pre_processors, )

        augmented_tokens = [
            model_special_tokens_pre_processors[self.model_name](
                example_token_ids) for example_token_ids in token_ids
        ]
        return augmented_tokens

    def _lm_specific_token_cleanup(
            self, split_token_ids: List[int],
            token_strings: List[Text]) -> Tuple[List[int], List[Text]]:
        """Clean up special chars added by tokenizers of language models.
        Many language models add a special char in front/back of (some) words. We clean up those chars as they are not
        needed once the features are already computed.
        Args:
            split_token_ids: List of token ids received as output from the language model specific tokenizer.
            token_strings: List of token strings received as output from the language model specific tokenizer.
        Returns:
            Cleaned up token ids and token strings.
        """
        from bothub.shared.utils.rasa_components.registry import model_tokens_cleaners

        return model_tokens_cleaners[self.model_name](split_token_ids,
                                                      token_strings)

    def _post_process_sequence_embeddings(
            self,
            sequence_embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute sentence level representations and sequence level representations for relevant tokens.
        Args:
            sequence_embeddings: Sequence level dense features received as output from language model.
        Returns:
            Sentence and sequence level representations.
        """

        from bothub.shared.utils.rasa_components.registry import (
            model_embeddings_post_processors, )

        sentence_embeddings = []
        post_processed_sequence_embeddings = []

        for example_embedding in sequence_embeddings:
            (
                example_sentence_embedding,
                example_post_processed_embedding,
            ) = model_embeddings_post_processors[self.model_name](
                example_embedding)

            sentence_embeddings.append(example_sentence_embedding)
            post_processed_sequence_embeddings.append(
                example_post_processed_embedding)

        return (
            np.array(sentence_embeddings),
            np.array(post_processed_sequence_embeddings),
        )

    def _tokenize_example(
            self,
            message: Message,
            attribute: Text,
            model_size: int = 384) -> Tuple[List[Token], List[int]]:
        """Tokenize a single message example.

        Many language models add a special char in front of (some) words and split words into
        sub-words. To ensure the entity start and end values match the token values,
        tokenize the text first using the whitespace tokenizer. If individual tokens
        are split up into multiple tokens, we make sure that the start and end value
        of the first and last respective tokens stay the same.

        Args:
            message: Single message object to be processed.
            attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``.
            model_size: Limit of tokens the model can handle (BERT = 512)

        Returns:
            List of token strings and token ids for the corresponding attribute of the message.
        """

        tokens_in = self.whitespace_tokenizer.tokenize(message, attribute)

        tokens_out = []

        token_ids_out = []

        for token in tokens_in:
            # use lm specific tokenizer to further tokenize the text
            split_token_ids, split_token_strings = self._lm_tokenize(
                token.text)

            split_token_ids, split_token_strings = self._lm_specific_token_cleanup(
                split_token_ids, split_token_strings)

            if len(tokens_out) + len(split_token_strings) >= model_size:
                logger.warning(
                    f"Number of tokens in the sentence exceeds the model size. "
                    f"Skipping the overflowing tokens... "
                    f"Sentence text: '{message.text[:50]} ...' ")
                break

            token_ids_out += split_token_ids

            tokens_out += train_utils.align_tokens(split_token_strings,
                                                   token.end, token.start)

        return tokens_out, token_ids_out
Example #24
class HFTransformersNLP(Component):
    """Utility Component for interfacing between Transformers library and Rasa OS.

    The transformers(https://github.com/huggingface/transformers) library
    is used to load pre-trained language models like BERT, GPT-2, etc.
    The component also tokenizes and featurizes dense featurizable attributes of each
    message.
    """

    defaults = {
        # name of the language model to load.
        "model_name": "bert",
        # Pre-Trained weights to be loaded(string)
        "model_weights": None,
        # an optional path to a specific directory to download
        # and cache the pre-trained model weights.
        "cache_dir": None,
    }

    def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        skip_model_load: bool = False,
    ) -> None:
        super(HFTransformersNLP, self).__init__(component_config)

        self._load_model_metadata()
        self._load_model_instance(skip_model_load)
        self.whitespace_tokenizer = WhitespaceTokenizer()

    def _load_model_metadata(self) -> None:

        from rasa.nlu.utils.hugging_face.registry import (
            model_class_dict,
            model_weights_defaults,
        )

        self.model_name = self.component_config["model_name"]

        if self.model_name not in model_class_dict:
            raise KeyError(
                f"'{self.model_name}' not a valid model name. Choose from "
                f"{str(list(model_class_dict.keys()))} or create"
                f"a new class inheriting from this class to support your model."
            )

        self.model_weights = self.component_config["model_weights"]
        self.cache_dir = self.component_config["cache_dir"]

        if not self.model_weights:
            logger.info(
                f"Model weights not specified. Will choose default model weights: "
                f"{model_weights_defaults[self.model_name]}")
            self.model_weights = model_weights_defaults[self.model_name]

        self.max_model_sequence_length = MAX_SEQUENCE_LENGTHS[self.model_name]

    def _load_model_instance(self, skip_model_load: bool) -> None:
        """Try loading the model instance

        Args:
            skip_model_load: Skip loading the model instances to save time. This should be True only for pytests
        """

        if skip_model_load:
            # This should be True only during pytests
            return

        from rasa.nlu.utils.hugging_face.registry import (
            model_class_dict,
            model_tokenizer_dict,
        )

        logger.debug(f"Loading Tokenizer and Model for {self.model_name}")

        self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
            self.model_weights, cache_dir=self.cache_dir)
        self.model = model_class_dict[self.model_name].from_pretrained(
            self.model_weights, cache_dir=self.cache_dir)

        # Use a universal pad token since not all transformer architectures have a
        # consistent token. Instead of pad_token_id we use unk_token_id because
        # pad_token_id is not set for all architectures. We can't add a new token as
        # well since vocabulary resizing is not yet supported for TF classes.
        # Also, this does not hurt the model predictions since we use an attention mask
        # while feeding input.
        self.pad_token_id = self.tokenizer.unk_token_id

    @classmethod
    def cache_key(cls, component_meta: Dict[Text, Any],
                  model_metadata: Metadata) -> Optional[Text]:

        weights = component_meta.get("model_weights") or {}

        return f"{cls.name}-{component_meta.get('model_name')}-{get_dict_hash(weights)}"

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["transformers"]

    def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]:
        """Pass the text through the tokenizer of the language model.

        Args:
            text: Text to be tokenized.

        Returns:
            List of token ids and token strings.

        """
        split_token_ids = self.tokenizer.encode(text, add_special_tokens=False)

        split_token_strings = self.tokenizer.convert_ids_to_tokens(
            split_token_ids)

        return split_token_ids, split_token_strings

    def _add_lm_specific_special_tokens(
            self, token_ids: List[List[int]]) -> List[List[int]]:
        """Add language model specific special tokens which were used during their training.

        Args:
            token_ids: List of token ids for each example in the batch.

        Returns:
            Augmented list of token ids for each example in the batch.
        """
        from rasa.nlu.utils.hugging_face.registry import (
            model_special_tokens_pre_processors, )

        augmented_tokens = [
            model_special_tokens_pre_processors[self.model_name](
                example_token_ids) for example_token_ids in token_ids
        ]
        return augmented_tokens

    def _lm_specific_token_cleanup(
            self, split_token_ids: List[int],
            token_strings: List[Text]) -> Tuple[List[int], List[Text]]:
        """Clean up special chars added by tokenizers of language models.

        Many language models add a special char in front/back of (some) words. We clean
        up those chars as they are not
        needed once the features are already computed.

        Args:
            split_token_ids: List of token ids received as output from the language
            model specific tokenizer.
            token_strings: List of token strings received as output from the language
            model specific tokenizer.

        Returns:
            Cleaned up token ids and token strings.
        """
        from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners

        return model_tokens_cleaners[self.model_name](split_token_ids,
                                                      token_strings)

    def _post_process_sequence_embeddings(
            self,
            sequence_embeddings: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute sentence level representations and sequence level representations
        for relevant tokens.

        Args:
            sequence_embeddings: Sequence level dense features received as output from
            language model.

        Returns:
            Sentence and sequence level representations.
        """

        from rasa.nlu.utils.hugging_face.registry import (
            model_embeddings_post_processors, )

        sentence_embeddings = []
        post_processed_sequence_embeddings = []

        for example_embedding in sequence_embeddings:
            (
                example_sentence_embedding,
                example_post_processed_embedding,
            ) = model_embeddings_post_processors[self.model_name](
                example_embedding)

            sentence_embeddings.append(example_sentence_embedding)
            post_processed_sequence_embeddings.append(
                example_post_processed_embedding)

        return (
            np.array(sentence_embeddings),
            np.array(post_processed_sequence_embeddings),
        )

    def _tokenize_example(self, message: Message,
                          attribute: Text) -> Tuple[List[Token], List[int]]:
        """Tokenize a single message example.

        Many language models add a special char in front of (some) words and split
        words into sub-words. To ensure the entity start and end values match the
        token values, tokenize the text first using the whitespace tokenizer. If
        individual tokens are split up into multiple tokens, we add this information
        to the respective token.

        Args:
            message: Single message object to be processed.
            attribute: Property of message to be processed, one of ``TEXT`` or
            ``RESPONSE``.

        Returns:
            List of token strings and token ids for the corresponding attribute of the
            message.
        """

        tokens_in = self.whitespace_tokenizer.tokenize(message, attribute)

        tokens_out = []

        token_ids_out = []

        for token in tokens_in:
            # use lm specific tokenizer to further tokenize the text
            split_token_ids, split_token_strings = self._lm_tokenize(
                token.text)

            split_token_ids, split_token_strings = self._lm_specific_token_cleanup(
                split_token_ids, split_token_strings)

            token_ids_out += split_token_ids

            token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings))

            tokens_out.append(token)

        return tokens_out, token_ids_out

    def _get_token_ids_for_batch(
            self, batch_examples: List[Message],
            attribute: Text) -> Tuple[List[List[Token]], List[List[int]]]:
        """Compute token ids and token strings for each example in batch.

        A token id is the id of that token in the vocabulary of the language model.
        Args:
            batch_examples: Batch of message objects for which tokens need to be
            computed.
            attribute: Property of message to be processed, one of ``TEXT`` or
            ``RESPONSE``.

        Returns:
            List of token strings and token ids for each example in the batch.
        """

        batch_token_ids = []
        batch_tokens = []
        for example in batch_examples:

            example_tokens, example_token_ids = self._tokenize_example(
                example, attribute)
            batch_tokens.append(example_tokens)
            batch_token_ids.append(example_token_ids)

        return batch_tokens, batch_token_ids

    @staticmethod
    def _compute_attention_mask(actual_sequence_lengths: List[int],
                                max_input_sequence_length: int) -> np.ndarray:
        """Compute a mask for padding tokens.

        This mask will be used by the language model so that it does not attend to
        padding tokens.

        Args:
            actual_sequence_lengths: List of length of each example without any padding.
            max_input_sequence_length: Maximum length of a sequence that will be present in the input batch. This is
            after taking into consideration the maximum input sequence the model can handle. Hence it can never be
            greater than self.max_model_sequence_length in case the model applies length restriction.

        Returns:
            Computed attention mask, 0 for padding and 1 for non-padding tokens.
        """

        attention_mask = []

        for actual_sequence_length in actual_sequence_lengths:
            # add 1s for present tokens, fill up the remaining space up to max
            # sequence length with 0s (non-existing tokens)
            padded_sequence = [1] * min(
                actual_sequence_length, max_input_sequence_length) + [0] * (
                    max_input_sequence_length -
                    min(actual_sequence_length, max_input_sequence_length))
            attention_mask.append(padded_sequence)

        attention_mask = np.array(attention_mask).astype(np.float32)
        return attention_mask

    def _extract_sequence_lengths(
            self, batch_token_ids: List[List[int]]) -> Tuple[List[int], int]:

        # Compute max length across examples
        max_input_sequence_length = 0
        actual_sequence_lengths = []

        for example_token_ids in batch_token_ids:
            sequence_length = len(example_token_ids)
            actual_sequence_lengths.append(sequence_length)
            max_input_sequence_length = max(max_input_sequence_length,
                                            len(example_token_ids))

        # Take into account the maximum sequence length the model can handle
        max_input_sequence_length = (
            max_input_sequence_length if
            self.max_model_sequence_length == NO_LENGTH_RESTRICTION else min(
                max_input_sequence_length, self.max_model_sequence_length))

        return actual_sequence_lengths, max_input_sequence_length

    def _add_padding_to_batch(
            self, batch_token_ids: List[List[int]],
            max_sequence_length_model: int) -> List[List[int]]:
        """Add padding so that all examples in the batch are of the same length.

        Args:
            batch_token_ids: Batch of examples where each example is a non-padded list
            of token ids.
            max_sequence_length_model: Maximum length of any input sequence in the batch
            to be fed to the model.

        Returns:
            Padded batch with all examples of the same length.
        """
        padded_token_ids = []

        # Add padding according to max_sequence_length
        # Some models don't contain pad token, we use unknown token as padding token.
        # This doesn't affect the computation since we compute an attention mask
        # anyways.
        for example_token_ids in batch_token_ids:

            # Truncate any longer sequences so that they can be fed to the model
            if len(example_token_ids) > max_sequence_length_model:
                example_token_ids = example_token_ids[:
                                                      max_sequence_length_model]

            padded_token_ids.append(
                example_token_ids + [self.pad_token_id] *
                (max_sequence_length_model - len(example_token_ids)))
        return padded_token_ids

    @staticmethod
    def _extract_nonpadded_embeddings(
            embeddings: np.ndarray,
            actual_sequence_lengths: List[int]) -> np.ndarray:
        """Use pre-computed non-padded lengths of each example to extract embeddings
        for non-padding tokens.

        Args:
            embeddings: sequence level representations for each example of the batch.
            actual_sequence_lengths: non-padded lengths of each example of the batch.

        Returns:
            Sequence level embeddings for only non-padding tokens of the batch.
        """
        nonpadded_sequence_embeddings = []
        for index, embedding in enumerate(embeddings):
            unmasked_embedding = embedding[:actual_sequence_lengths[index]]
            nonpadded_sequence_embeddings.append(unmasked_embedding)

        return np.array(nonpadded_sequence_embeddings)

    def _compute_batch_sequence_features(
            self, batch_attention_mask: np.ndarray,
            padded_token_ids: List[List[int]]) -> np.ndarray:
        """Feed the padded batch to the language model.

        Args:
            batch_attention_mask: Mask of 0s and 1s which indicate whether the token
            is a padding token or not.
            padded_token_ids: Batch of token ids for each example. The batch is padded
            and hence can be fed at once.

        Returns:
            Sequence level representations from the language model.
        """
        model_outputs = self.model(
            np.array(padded_token_ids),
            attention_mask=np.array(batch_attention_mask))

        # sequence hidden states is always the first output from all models
        sequence_hidden_states = model_outputs[0]

        sequence_hidden_states = sequence_hidden_states.numpy()
        return sequence_hidden_states

    def _validate_sequence_lengths(
        self,
        actual_sequence_lengths: List[int],
        batch_examples: List[Message],
        attribute: Text,
        inference_mode: bool = False,
    ) -> None:
        """Validate if sequence lengths of all inputs are less the max sequence length the model can handle

        This method should throw an error during training, whereas log a debug message during inference if
        any of the input examples have a length greater than maximum sequence length allowed.

        Args:
            actual_sequence_lengths: original sequence length of all inputs
            batch_examples: all message instances in the batch
            attribute: attribute of message object to be processed
            inference_mode: Whether this is during training or during inferencing
        """
        if self.max_model_sequence_length == NO_LENGTH_RESTRICTION:
            # There is no restriction on sequence length from the model
            return

        for sequence_length, example in zip(actual_sequence_lengths,
                                            batch_examples):
            if sequence_length > self.max_model_sequence_length:
                if not inference_mode:
                    raise RuntimeError(
                        f"The sequence length of '{example.get(attribute)[:20]}...' "
                        f"is too long({sequence_length} tokens) for the "
                        f"model chosen {self.model_name} which has a maximum "
                        f"sequence length of {self.max_model_sequence_length} tokens. Either "
                        f"shorten the message or use a model which has no "
                        f"restriction on input sequence length like XLNet.")
                else:
                    logger.debug(
                        f"The sequence length of '{example.get(attribute)[:20]}...' "
                        f"is too long({sequence_length} tokens) for the "
                        f"model chosen {self.model_name} which has a maximum "
                        f"sequence length of {self.max_model_sequence_length} tokens. "
                        f"Downstream model predictions may be affected because of this."
                    )

    def _add_extra_padding(self, sequence_embeddings: np.ndarray,
                           actual_sequence_lengths: List[int]) -> np.ndarray:
        """
        Add extra zero padding to match the original sequence length.

        This is only done if the input was truncated during the batch preparation of input for the model.
        Args:
            sequence_embeddings: Embeddings returned from the model
            actual_sequence_lengths: original sequence length of all inputs

        Returns:
            Modified sequence embeddings with padding if necessary
        """

        if self.max_model_sequence_length == NO_LENGTH_RESTRICTION:
            # No extra padding needed because there wouldn't have been any truncation in the first place
            return sequence_embeddings

        reshaped_sequence_embeddings = []
        for index, embedding in enumerate(sequence_embeddings):
            embedding_size = embedding.shape[-1]
            if actual_sequence_lengths[index] > self.max_model_sequence_length:
                embedding = np.concatenate([
                    embedding,
                    np.zeros(
                        (
                            actual_sequence_lengths[index] -
                            self.max_model_sequence_length,
                            embedding_size,
                        ),
                        dtype=np.float32,
                    ),
                ])
            reshaped_sequence_embeddings.append(embedding)

        return np.array(reshaped_sequence_embeddings)

    def _get_model_features_for_batch(
        self,
        batch_token_ids: List[List[int]],
        batch_tokens: List[List[Token]],
        batch_examples: List[Message],
        attribute: Text,
        inference_mode: bool = False,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute dense features of each example in the batch.

        We first add the special tokens corresponding to each language model. Next, we
        add appropriate padding and compute a mask for that padding so that it doesn't
        affect the feature computation. The padded batch is next fed to the language
        model and token level embeddings are computed. Using the pre-computed mask,
        embeddings for non-padding tokens are extracted and subsequently sentence
        level embeddings are computed.

        Args:
            batch_token_ids: List of token ids of each example in the batch.
            batch_tokens: List of token objects for each example in the batch.
            batch_examples: List of examples in the batch.
            attribute: attribute of the Message object to be processed.
            inference_mode: Whether the call is during training or during inference.

        Returns:
            Sentence and token level dense representations.
        """
        # Let's first add tokenizer specific special tokens to all examples
        batch_token_ids_augmented = self._add_lm_specific_special_tokens(
            batch_token_ids)

        # Compute sequence lengths for all examples
        (
            actual_sequence_lengths,
            max_input_sequence_length,
        ) = self._extract_sequence_lengths(batch_token_ids_augmented)

        # Validate that all sequences can be processed based on their sequence lengths and
        # the maximum sequence length the model can handle
        self._validate_sequence_lengths(actual_sequence_lengths,
                                        batch_examples, attribute,
                                        inference_mode)

        # Add padding so that whole batch can be fed to the model
        padded_token_ids = self._add_padding_to_batch(
            batch_token_ids_augmented, max_input_sequence_length)

        # Compute attention mask based on actual_sequence_length
        batch_attention_mask = self._compute_attention_mask(
            actual_sequence_lengths, max_input_sequence_length)

        # Get token level features from the model
        sequence_hidden_states = self._compute_batch_sequence_features(
            batch_attention_mask, padded_token_ids)

        # Extract features for only non-padding tokens
        sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings(
            sequence_hidden_states, actual_sequence_lengths)

        # Extract sentence level and post-processed features
        (
            sentence_embeddings,
            sequence_embeddings,
        ) = self._post_process_sequence_embeddings(
            sequence_nonpadded_embeddings)

        # Pad zeros for examples which were truncated in inference mode.
        # This is intentionally done after sentence embeddings have been extracted so that they are not affected
        sequence_embeddings = self._add_extra_padding(sequence_embeddings,
                                                      actual_sequence_lengths)

        # shape of matrix for all sequence embeddings
        batch_dim = len(sequence_embeddings)
        seq_dim = max(e.shape[0] for e in sequence_embeddings)
        feature_dim = sequence_embeddings[0].shape[1]
        shape = (batch_dim, seq_dim, feature_dim)

        # align features with tokens so that we have just one vector per token
        # (don't include sub-tokens)
        sequence_embeddings = train_utils.align_token_features(
            batch_tokens, sequence_embeddings, shape)

        # sequence_embeddings is a padded numpy array
        # remove the padding, keep just the non-zero vectors
        sequence_final_embeddings = []
        for embeddings, tokens in zip(sequence_embeddings, batch_tokens):
            sequence_final_embeddings.append(embeddings[:len(tokens)])
        sequence_final_embeddings = np.array(sequence_final_embeddings)

        return sentence_embeddings, sequence_final_embeddings

    def _get_docs_for_batch(
        self,
        batch_examples: List[Message],
        attribute: Text,
        inference_mode: bool = False,
    ) -> List[Dict[Text, Any]]:
        """Compute language model docs for all examples in the batch.

        Args:
            batch_examples: Batch of message objects for which language model docs
            need to be computed.
            attribute: Property of message to be processed, one of ``TEXT`` or
            ``RESPONSE``.
            inference_mode: Whether the call is during inference or during training.


        Returns:
            List of language model docs for each message in batch.
        """

        batch_tokens, batch_token_ids = self._get_token_ids_for_batch(
            batch_examples, attribute)

        (
            batch_sentence_features,
            batch_sequence_features,
        ) = self._get_model_features_for_batch(batch_token_ids, batch_tokens,
                                               batch_examples, attribute,
                                               inference_mode)

        # A doc consists of
        # {'token_ids': ..., 'tokens': ..., 'sequence_features': ...,
        # 'sentence_features': ...}
        batch_docs = []
        for index in range(len(batch_examples)):
            doc = {
                TOKEN_IDS:
                batch_token_ids[index],
                TOKENS:
                batch_tokens[index],
                SEQUENCE_FEATURES:
                batch_sequence_features[index],
                SENTENCE_FEATURES:
                np.reshape(batch_sentence_features[index], (1, -1)),
            }
            batch_docs.append(doc)

        return batch_docs

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        """Compute tokens and dense features for each message in training data.

        Args:
            training_data: NLU training data to be tokenized and featurized
            config: NLU pipeline config consisting of all components.

        """

        batch_size = 64

        for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:

            non_empty_examples = list(
                filter(lambda x: x.get(attribute),
                       training_data.training_examples))

            batch_start_index = 0

            while batch_start_index < len(non_empty_examples):

                batch_end_index = min(batch_start_index + batch_size,
                                      len(non_empty_examples))
                # Collect batch examples
                batch_messages = non_empty_examples[
                    batch_start_index:batch_end_index]

                # Construct a doc with relevant features
                # extracted(tokens, dense_features)
                batch_docs = self._get_docs_for_batch(batch_messages,
                                                      attribute)

                for index, ex in enumerate(batch_messages):

                    ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index])

                batch_start_index += batch_size

    def process(self, message: Message, **kwargs: Any) -> None:
        """Process an incoming message by computing its tokens and dense features.

        Args:
            message: Incoming message object
        """

        # `process` of all featurizers operates only on the TEXT and ACTION_TEXT
        # attributes, because all other attributes are labels which are featurized
        # during training and whose features are stored by the model itself.
        for attribute in {TEXT, ACTION_TEXT}:
            if message.get(attribute):
                message.set(
                    LANGUAGE_MODEL_DOCS[attribute],
                    self._get_docs_for_batch([message],
                                             attribute=attribute,
                                             inference_mode=True)[0],
                )
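
Below is a minimal, hedged sketch (not part of the featurizer above) of the doc layout that _get_docs_for_batch attaches to each message. Random numpy arrays stand in for real model output; the plain string keys and the embedding size of 768 are assumptions made only for illustration (the real code uses the TOKEN_IDS/TOKENS/SEQUENCE_FEATURES/SENTENCE_FEATURES constants and stores Token objects under "tokens").

import numpy as np

num_tokens, embedding_dim = 4, 768  # hypothetical sizes for a BERT-base style model
doc = {
    "token_ids": [2023, 2003, 1037, 3231],  # ids in the language model vocabulary
    "tokens": ["this", "is", "a", "test"],  # Token objects in the real doc; strings here for brevity
    "sequence_features": np.random.rand(num_tokens, embedding_dim),  # one vector per token
    "sentence_features": np.random.rand(1, embedding_dim),           # reshaped to (1, -1)
}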
Example #25
0
class HFTransformersNLP(Component):
    """Utility Component for interfacing between Transformers library and Rasa OS.

    The transformers(https://github.com/huggingface/transformers) library
    is used to load pre-trained language models like BERT, GPT-2, etc.
    The component also tokenizes and featurizes dense featurizable attributes of each
    message.
    """

    defaults = {
        # Name of the language model to load.
        "model_name": "bert",
        # Pre-trained weights to be loaded (string).
        "model_weights": None,
    }

    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
        super(HFTransformersNLP, self).__init__(component_config)

        self._load_model()
        self.whitespace_tokenizer = WhitespaceTokenizer()

    def _load_model(self) -> None:
        """Try loading the model"""

        from rasa.nlu.utils.hugging_face.registry import (
            model_class_dict,
            model_weights_defaults,
            model_tokenizer_dict,
        )

        self.model_name = self.component_config["model_name"]

        if self.model_name not in model_class_dict:
            raise KeyError(
                f"'{self.model_name}' is not a valid model name. Choose from "
                f"{str(list(model_class_dict.keys()))} or create "
                f"a new class inheriting from this class to support your model."
            )

        self.model_weights = self.component_config["model_weights"]

        if not self.model_weights:
            logger.info(
                f"Model weights not specified. Will choose default model weights: "
                f"{model_weights_defaults[self.model_name]}"
            )
            self.model_weights = model_weights_defaults[self.model_name]

        logger.debug(f"Loading Tokenizer and Model for {self.model_name}")

        self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
            self.model_weights
        )
        self.model = model_class_dict[self.model_name].from_pretrained(
            self.model_weights
        )

        # Use a universal pad token since not all transformer architectures have a
        # consistent pad token. Instead of pad_token_id we use unk_token_id because
        # pad_token_id is not set for all architectures. We also can't add a new
        # token, since vocabulary resizing is not yet supported for the TF classes.
        # This does not hurt the model predictions since we pass an attention mask
        # along with the input.
        self.pad_token_id = self.tokenizer.unk_token_id

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["transformers"]

    def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]:
        """
        Pass the text through the tokenizer of the language model.

        Args:
            text: Text to be tokenized.

        Returns:
            List of token ids and token strings.

        """
        split_token_ids = self.tokenizer.encode(text, add_special_tokens=False)

        split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids)

        return split_token_ids, split_token_strings

    def _add_lm_specific_special_tokens(
        self, token_ids: List[List[int]]
    ) -> List[List[int]]:
        """Add language model specific special tokens which were used during their training.

        Args:
            token_ids: List of token ids for each example in the batch.

        Returns:
            Augmented list of token ids for each example in the batch.
        """
        from rasa.nlu.utils.hugging_face.registry import (
            model_special_tokens_pre_processors,
        )

        augmented_tokens = [
            model_special_tokens_pre_processors[self.model_name](example_token_ids)
            for example_token_ids in token_ids
        ]
        return augmented_tokens

    def _lm_specific_token_cleanup(
        self, split_token_ids: List[int], token_strings: List[Text]
    ) -> Tuple[List[int], List[Text]]:
        """Clean up special chars added by tokenizers of language models.

        Many language models add a special char in front/back of (some) words. We
        clean up those chars as they are not needed once the features are computed.

        Args:
            split_token_ids: List of token ids received as output from the language model specific tokenizer.
            token_strings: List of token strings received as output from the language model specific tokenizer.

        Returns:
            Cleaned up token ids and token strings.
        """
        from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners

        return model_tokens_cleaners[self.model_name](split_token_ids, token_strings)

    def _post_process_sequence_embeddings(
        self, sequence_embeddings: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute sentence level representations and sequence level representations for relevant tokens.

        Args:
            sequence_embeddings: Sequence level dense features received as output from language model.

        Returns:
            Sentence and sequence level representations.
        """

        from rasa.nlu.utils.hugging_face.registry import (
            model_embeddings_post_processors,
        )

        sentence_embeddings = []
        post_processed_sequence_embeddings = []

        for example_embedding in sequence_embeddings:
            (
                example_sentence_embedding,
                example_post_processed_embedding,
            ) = model_embeddings_post_processors[self.model_name](example_embedding)

            sentence_embeddings.append(example_sentence_embedding)
            post_processed_sequence_embeddings.append(example_post_processed_embedding)

        return (
            np.array(sentence_embeddings),
            np.array(post_processed_sequence_embeddings),
        )

    def _tokenize_example(
        self, message: Message, attribute: Text
    ) -> Tuple[List[Token], List[int]]:
        """Tokenize a single message example.

        Many language models add a special char in front of (some) words and split
        words into sub-words. To ensure the entity start and end values match the
        token values, tokenize the text first using the whitespace tokenizer. If
        individual tokens are split up into multiple tokens, we make sure that the
        start and end value of the first and last respective tokens stay the same.

        Args:
            message: Single message object to be processed.
            attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``.

        Returns:
            List of token strings and token ids for the corresponding attribute of the message.
        """

        tokens_in = self.whitespace_tokenizer.tokenize(message, attribute)

        tokens_out = []

        token_ids_out = []

        for token in tokens_in:
            # use lm specific tokenizer to further tokenize the text
            split_token_ids, split_token_strings = self._lm_tokenize(token.text)

            split_token_ids, split_token_strings = self._lm_specific_token_cleanup(
                split_token_ids, split_token_strings
            )

            token_ids_out += split_token_ids

            tokens_out += train_utils.align_tokens(
                split_token_strings, token.end, token.start
            )

        return tokens_out, token_ids_out

    def _get_token_ids_for_batch(
        self, batch_examples: List[Message], attribute: Text
    ) -> Tuple[List[List[Token]], List[List[int]]]:
        """Compute token ids and token strings for each example in batch.

        A token id is the id of that token in the vocabulary of the language model.

        Args:
            batch_examples: Batch of message objects for which tokens need to be computed.
            attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``.

        Returns:
            List of token strings and token ids for each example in the batch.
        """

        batch_token_ids = []
        batch_tokens = []
        for example in batch_examples:

            example_tokens, example_token_ids = self._tokenize_example(
                example, attribute
            )
            batch_tokens.append(example_tokens)
            batch_token_ids.append(example_token_ids)

        return batch_tokens, batch_token_ids

    @staticmethod
    def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.ndarray:
        """Compute a mask for padding tokens.

        This mask will be used by the language model so that it does not attend to padding tokens.

        Args:
            actual_sequence_lengths: List of the length of each example without any padding.

        Returns:
            Computed attention mask, 0 for padding and 1 for non-padding tokens.
        """

        attention_mask = []
        max_seq_length = max(actual_sequence_lengths)
        for actual_sequence_length in actual_sequence_lengths:
            # add 1s for present tokens, fill up the remaining space up to max
            # sequence length with 0s (non-existing tokens)
            padded_sequence = [1] * actual_sequence_length + [0] * (
                max_seq_length - actual_sequence_length
            )
            attention_mask.append(padded_sequence)

        attention_mask = np.array(attention_mask).astype(np.float32)

        return attention_mask

    def _add_padding_to_batch(
        self, batch_token_ids: List[List[int]]
    ) -> Tuple[List[int], List[List[int]]]:
        """Add padding so that all examples in the batch are of the same length.

        Args:
            batch_token_ids: Batch of examples where each example is a non-padded list of token ids.

        Returns:
            Padded batch with all examples of the same length.
        """
        padded_token_ids = []
        # Compute max length across examples
        max_seq_len = 0
        actual_sequence_lengths = []

        for example_token_ids in batch_token_ids:
            actual_sequence_lengths.append(len(example_token_ids))
            max_seq_len = max(max_seq_len, len(example_token_ids))

        # Add padding according to max_seq_len.
        # Some models don't have a pad token, so we use the unknown token as the
        # padding token. This doesn't affect the computation since we compute an
        # attention mask anyway.
        for example_token_ids in batch_token_ids:
            padded_token_ids.append(
                example_token_ids
                + [self.pad_token_id] * (max_seq_len - len(example_token_ids))
            )
        return actual_sequence_lengths, padded_token_ids

    @staticmethod
    def _extract_nonpadded_embeddings(
        embeddings: np.ndarray, actual_sequence_lengths: List[int]
    ) -> np.ndarray:
        """Use pre-computed non-padded lengths of each example to extract embeddings for non-padding tokens.

        Args:
            embeddings: sequence level representations for each example of the batch.
            actual_sequence_lengths: non-padded lengths of each example of the batch.

        Returns:
            Sequence level embeddings for only non-padding tokens of the batch.
        """
        nonpadded_sequence_embeddings = []
        for index, embedding in enumerate(embeddings):
            unmasked_embedding = embedding[: actual_sequence_lengths[index]]
            nonpadded_sequence_embeddings.append(unmasked_embedding)

        return np.array(nonpadded_sequence_embeddings)

    def _compute_batch_sequence_features(
        self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]]
    ) -> np.ndarray:
        """Feed the padded batch to the language model.

        Args:
            batch_attention_mask: Mask of 0s and 1s which indicate whether the token is a padding token or not.
            padded_token_ids: Batch of token ids for each example. The batch is padded and hence can be fed at once.

        Returns:
            Sequence level representations from the language model.
        """
        model_outputs = self.model(
            np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
        )

        # sequence hidden states is always the first output from all models
        sequence_hidden_states = model_outputs[0]

        sequence_hidden_states = sequence_hidden_states.numpy()
        return sequence_hidden_states

    def _get_model_features_for_batch(
        self, batch_token_ids: List[List[int]]
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute dense features of each example in the batch.

        We first add the special tokens corresponding to each language model. Next, we add appropriate padding
        and compute a mask for that padding so that it doesn't affect the feature computation. The padded batch is next
        fed to the language model and token level embeddings are computed. Using the pre-computed mask, embeddings for
        non-padding tokens are extracted and subsequently sentence level embeddings are computed.

        Args:
            batch_token_ids: List of token ids of each example in the batch.

        Returns:
            Sentence and token level dense representations.
        """
        # Let's first add tokenizer specific special tokens to all examples
        batch_token_ids_augmented = self._add_lm_specific_special_tokens(
            batch_token_ids
        )

        # Now add padding so that the whole batch can be fed to the model
        actual_sequence_lengths, padded_token_ids = self._add_padding_to_batch(
            batch_token_ids_augmented
        )

        # Compute attention mask based on actual_sequence_length
        batch_attention_mask = self._compute_attention_mask(actual_sequence_lengths)

        # Get token level features from the model
        sequence_hidden_states = self._compute_batch_sequence_features(
            batch_attention_mask, padded_token_ids
        )

        # Extract features for only non-padding tokens
        sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings(
            sequence_hidden_states, actual_sequence_lengths
        )

        # Extract sentence level and post-processed features
        (
            sentence_embeddings,
            sequence_final_embeddings,
        ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings)

        return sentence_embeddings, sequence_final_embeddings

    def _get_docs_for_batch(
        self, batch_examples: List[Message], attribute: Text
    ) -> List[Dict[Text, Any]]:
        """Compute language model docs for all examples in the batch.

        Args:
            batch_examples: Batch of message objects for which language model docs need to be computed.
            attribute: Property of message to be processed, one of ``TEXT`` or ``RESPONSE``.

        Returns:
            List of language model docs for each message in batch.
        """

        batch_tokens, batch_token_ids = self._get_token_ids_for_batch(
            batch_examples, attribute
        )

        (
            batch_sentence_features,
            batch_sequence_features,
        ) = self._get_model_features_for_batch(batch_token_ids)

        # A doc consists of
        # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...}
        batch_docs = []
        for index in range(len(batch_examples)):
            doc = {
                TOKEN_IDS: batch_token_ids[index],
                TOKENS: batch_tokens[index],
                SEQUENCE_FEATURES: batch_sequence_features[index],
                SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)),
            }
            batch_docs.append(doc)

        return batch_docs

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:
        """Compute tokens and dense features for each message in training data.

        Args:
            training_data: NLU training data to be tokenized and featurized.
            config: NLU pipeline config consisting of all components.
        """

        batch_size = 64

        for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:

            non_empty_examples = list(
                filter(lambda x: x.get(attribute), training_data.training_examples)
            )

            batch_start_index = 0

            while batch_start_index < len(non_empty_examples):

                batch_end_index = min(
                    batch_start_index + batch_size, len(non_empty_examples)
                )
                # Collect batch examples
                batch_messages = non_empty_examples[batch_start_index:batch_end_index]

                # Construct a doc with relevant features extracted (tokens, dense_features)
                batch_docs = self._get_docs_for_batch(batch_messages, attribute)

                for index, ex in enumerate(batch_messages):

                    ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index])

                batch_start_index += batch_size

    def process(self, message: Message, **kwargs: Any) -> None:
        """Process an incoming message by computing its tokens and dense features.

        Args:
            message: Incoming message object
        """

        message.set(
            LANGUAGE_MODEL_DOCS[TEXT],
            self._get_docs_for_batch([message], attribute=TEXT)[0],
        )
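
A standalone illustration of the padding and attention-mask logic implemented by _add_padding_to_batch and _compute_attention_mask above, run on a toy batch of token ids; the pad id 100 merely stands in for tokenizer.unk_token_id.

import numpy as np

batch_token_ids = [[101, 7592, 102], [101, 7592, 2088, 999, 102]]
pad_token_id = 100  # stand-in for tokenizer.unk_token_id

actual_sequence_lengths = [len(ids) for ids in batch_token_ids]
max_seq_len = max(actual_sequence_lengths)

# pad every example up to the longest example in the batch
padded_token_ids = [
    ids + [pad_token_id] * (max_seq_len - len(ids)) for ids in batch_token_ids
]
# 1 for real tokens, 0 for padding tokens
attention_mask = np.array(
    [[1] * length + [0] * (max_seq_len - length) for length in actual_sequence_lengths]
).astype(np.float32)

# padded_token_ids -> [[101, 7592, 102, 100, 100], [101, 7592, 2088, 999, 102]]
# attention_mask   -> [[1., 1., 1., 0., 0.], [1., 1., 1., 1., 1.]]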
Example #26
0
class HFTransformersNLP(Component):
    """Utility Component for interfacing between Transformers library.

    The transformers(https://github.com/huggingface/transformers) library
    is used to load pre-trained language models like BERT, GPT-2, etc.
    The component also tokenizes and featurizes dense featurizable attributes of each
    message.
    """

    defaults = {
        # Name of the language model to load.
        "model_name": "bert",
        # Pre-trained weights to be loaded (string).
        "model_weights": None,
    }

    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
        super(HFTransformersNLP, self).__init__(component_config)

        self._load_model()
        self.whitespace_tokenizer = WhitespaceTokenizer()

    def _load_model(self) -> None:
        """Try loading the model"""

        from rasa.nlu.utils.hugging_face.registry import (
            model_class_dict,
            model_weights_defaults,
            model_tokenizer_dict,
        )

        self.model_name = self.component_config["model_name"]

        if self.model_name not in model_class_dict:
            raise KeyError(
                f"'{self.model_name}' is not a valid model name. Choose from "
                f"{str(list(model_class_dict.keys()))} or create "
                f"a new class inheriting from this class to support your model."
            )

        self.model_weights = self.component_config["model_weights"]

        if not self.model_weights:
            logger.info(
                f"Model weights not specified. Will choose default model weights: "
                f"{model_weights_defaults[self.model_name]}"
            )
            self.model_weights = model_weights_defaults[self.model_name]

        logger.debug(f"Loading Tokenizer and Model for {self.model_name}")

        self.tokenizer = model_tokenizer_dict[self.model_name].from_pretrained(
            self.model_weights
        )
        self.model = model_class_dict[self.model_name].from_pretrained(
            self.model_weights
        )

        # Use a universal pad token since not all transformer architectures have a
        # consistent pad token. Instead of pad_token_id we use unk_token_id because
        # pad_token_id is not set for all architectures. We also can't add a new
        # token, since vocabulary resizing is not yet supported for the TF classes.
        # This does not hurt the model predictions since we pass an attention mask
        # along with the input.
        self.pad_token_id = self.tokenizer.unk_token_id

    @classmethod
    def required_packages(cls) -> List[Text]:
        return ["transformers"]

    def _lm_tokenize(self, text: Text) -> Tuple[List[int], List[Text]]:
        split_token_ids = self.tokenizer.encode(text, add_special_tokens=False)

        split_token_strings = self.tokenizer.convert_ids_to_tokens(split_token_ids)

        return split_token_ids, split_token_strings

    def _add_lm_specific_special_tokens(
        self, token_ids: List[List[int]]
    ) -> List[List[int]]:
        from rasa.nlu.utils.hugging_face.registry import (
            model_special_tokens_pre_processors,
        )

        augmented_tokens = [
            model_special_tokens_pre_processors[self.model_name](example_token_ids)
            for example_token_ids in token_ids
        ]
        return augmented_tokens

    def _lm_specific_token_cleanup(self, token_strings: List[Text]) -> List[Text]:
        from rasa.nlu.utils.hugging_face.registry import model_tokens_cleaners

        return model_tokens_cleaners[self.model_name](token_strings)

    def _post_process_sequence_embeddings(
        self, sequence_embeddings: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:

        from rasa.nlu.utils.hugging_face.registry import (
            model_embeddings_post_processors,
        )

        sentence_embeddings = []
        post_processed_sequence_embeddings = []

        for example_embedding in sequence_embeddings:
            (
                example_sentence_embedding,
                example_post_processed_embedding,
            ) = model_embeddings_post_processors[self.model_name](example_embedding)

            sentence_embeddings.append(example_sentence_embedding)
            post_processed_sequence_embeddings.append(example_post_processed_embedding)

        return (
            np.array(sentence_embeddings),
            np.array(post_processed_sequence_embeddings),
        )

    def _tokenize_example(
        self, message: Message, attribute: Text
    ) -> Tuple[List[Token], List[int]]:

        tokens_in = self.whitespace_tokenizer.tokenize(message, attribute)

        tokens_out = []

        token_ids_out = []

        for token in tokens_in:
            # use lm specific tokenizer to further tokenize the text
            split_token_ids, split_token_strings = self._lm_tokenize(token.text)

            split_token_strings = self._lm_specific_token_cleanup(split_token_strings)

            token_ids_out += split_token_ids

            tokens_out += train_utils.align_tokens(
                split_token_strings, token.end, token.start
            )

        return tokens_out, token_ids_out

    def _get_token_ids_for_batch(
        self, batch_examples: List[Message], attribute: Text
    ) -> Tuple[List[List[Token]], List[List[int]]]:

        batch_token_ids = []
        batch_tokens = []
        for example in batch_examples:

            example_tokens, example_token_ids = self._tokenize_example(
                example, attribute
            )
            batch_tokens.append(example_tokens)
            batch_token_ids.append(example_token_ids)

        return batch_tokens, batch_token_ids

    @staticmethod
    def _compute_attention_mask(actual_sequence_lengths: List[int]) -> np.ndarray:

        attention_mask = []
        max_seq_length = max(actual_sequence_lengths)
        for actual_sequence_length in actual_sequence_lengths:
            # add 1s for present tokens, fill up the remaining space up to max
            # sequence length with 0s (non-existing tokens)
            padded_sequence = [1] * actual_sequence_length + [0] * (
                max_seq_length - actual_sequence_length
            )
            attention_mask.append(padded_sequence)

        attention_mask = np.array(attention_mask).astype(np.float32)

        return attention_mask

    def _add_padding_to_batch(
        self, batch_token_ids: List[List[int]]
    ) -> Tuple[List[int], List[List[int]]]:
        padded_token_ids = []
        # Compute max length across examples
        max_seq_len = 0
        actual_sequence_lengths = []

        for example_token_ids in batch_token_ids:
            actual_sequence_lengths.append(len(example_token_ids))
            max_seq_len = max(max_seq_len, len(example_token_ids))

        # Add padding according to max_seq_len.
        # Some models don't have a pad token, so we use the unknown token as the
        # padding token. This doesn't affect the computation since we compute an
        # attention mask anyway.
        for example_token_ids in batch_token_ids:
            padded_token_ids.append(
                example_token_ids
                + [self.pad_token_id] * (max_seq_len - len(example_token_ids))
            )
        return actual_sequence_lengths, padded_token_ids

    @staticmethod
    def _extract_nonpadded_embeddings(
        embeddings: np.ndarray, actual_sequence_lengths: List[int]
    ) -> np.ndarray:
        nonpadded_sequence_embeddings = []
        for index, embedding in enumerate(embeddings):
            unmasked_embedding = embedding[: actual_sequence_lengths[index]]
            nonpadded_sequence_embeddings.append(unmasked_embedding)

        return np.array(nonpadded_sequence_embeddings)

    def _compute_batch_sequence_features(
        self, batch_attention_mask: np.ndarray, padded_token_ids: List[List[int]]
    ) -> np.ndarray:
        model_outputs = self.model(
            np.array(padded_token_ids), attention_mask=np.array(batch_attention_mask)
        )

        # sequence hidden states is always the first output from all models
        sequence_hidden_states = model_outputs[0]

        sequence_hidden_states = sequence_hidden_states.numpy()
        return sequence_hidden_states

    def _get_model_features_for_batch(
        self, batch_token_ids: List[List[int]]
    ) -> Tuple[np.ndarray, np.ndarray]:
        # Let's first add tokenizer specific special tokens to all examples
        batch_token_ids_augmented = self._add_lm_specific_special_tokens(
            batch_token_ids
        )

        # Now add padding so that the whole batch can be fed to the model
        actual_sequence_lengths, padded_token_ids = self._add_padding_to_batch(
            batch_token_ids_augmented
        )

        # Compute attention mask based on actual_sequence_length
        batch_attention_mask = self._compute_attention_mask(actual_sequence_lengths)

        # Get token level features from the model
        sequence_hidden_states = self._compute_batch_sequence_features(
            batch_attention_mask, padded_token_ids
        )

        # Extract features for only non-padding tokens
        sequence_nonpadded_embeddings = self._extract_nonpadded_embeddings(
            sequence_hidden_states, actual_sequence_lengths
        )

        # Extract sentence level and post-processed features
        (
            sentence_embeddings,
            sequence_final_embeddings,
        ) = self._post_process_sequence_embeddings(sequence_nonpadded_embeddings)

        return sentence_embeddings, sequence_final_embeddings

    def _get_docs_for_batch(
        self, batch_examples: List[Message], attribute: Text
    ) -> List[Dict[Text, Any]]:

        batch_tokens, batch_token_ids = self._get_token_ids_for_batch(
            batch_examples, attribute
        )

        (
            batch_sentence_features,
            batch_sequence_features,
        ) = self._get_model_features_for_batch(batch_token_ids)

        # A doc consists of
        # {'token_ids': ..., 'tokens': ..., 'sequence_features': ..., 'sentence_features': ...}
        batch_docs = []
        for index in range(len(batch_examples)):
            doc = {
                TOKEN_IDS: batch_token_ids[index],
                TOKENS: batch_tokens[index],
                SEQUENCE_FEATURES: batch_sequence_features[index],
                SENTENCE_FEATURES: np.reshape(batch_sentence_features[index], (1, -1)),
            }
            batch_docs.append(doc)

        return batch_docs

    def train(
        self,
        training_data: TrainingData,
        config: Optional[RasaNLUModelConfig] = None,
        **kwargs: Any,
    ) -> None:

        batch_size = 64

        for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:

            non_empty_examples = list(
                filter(lambda x: x.get(attribute), training_data.training_examples)
            )

            batch_start_index = 0

            while batch_start_index < len(non_empty_examples):

                batch_end_index = min(
                    batch_start_index + batch_size, len(non_empty_examples)
                )
                # Collect batch examples
                batch_messages = non_empty_examples[batch_start_index:batch_end_index]

                # Construct a doc with relevant features extracted (tokens, dense_features)
                batch_docs = self._get_docs_for_batch(batch_messages, attribute)

                for index, ex in enumerate(batch_messages):

                    ex.set(LANGUAGE_MODEL_DOCS[attribute], batch_docs[index])

                batch_start_index += batch_size

    def process(self, message: Message, **kwargs: Any) -> None:

        message.set(
            LANGUAGE_MODEL_DOCS[TEXT],
            self._get_docs_for_batch([message], attribute=TEXT)[0],
        )
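
A hedged usage sketch, not from the original source: it exercises the component above end to end and assumes the transformers package is installed, that the bert weights named below can be downloaded, and that Message, TEXT, LANGUAGE_MODEL_DOCS, TOKENS, SEQUENCE_FEATURES and SENTENCE_FEATURES from the surrounding module are in scope.

component = HFTransformersNLP({"model_name": "bert",
                               "model_weights": "bert-base-uncased"})  # weights are an assumption

message = Message("show me flights to New York City")
component.process(message)

doc = message.get(LANGUAGE_MODEL_DOCS[TEXT])
print([t.text for t in doc[TOKENS]])    # whitespace-aligned tokens
print(doc[SEQUENCE_FEATURES].shape)     # (num_tokens, embedding_dim)
print(doc[SENTENCE_FEATURES].shape)     # (1, embedding_dim)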
Example #27
0
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP
from rasa.nlu.training_data import Message
from rasa.nlu.constants import (TEXT, SPACY_DOCS)

logger = logging_setup()

test_input = "Okay, pick up this yellow banana for me."
message = Message(test_input)

tk = WhitespaceTokenizer()
tokens = tk.tokenize(message, attribute=TEXT)
logger.info('Whitespace: {}'.format([t.text for t in tokens]))

tk = SpacyTokenizer()

message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
tokens = tk.tokenize(message, attribute=TEXT)
logger.info('SpaCy: {}'.format([t.text for t in tokens]))

tk = MitieTokenizer()
tokens = tk.tokenize(message, attribute=TEXT)
logger.info('Mitie: {}'.format([t.text for t in tokens]))

tk = ConveRTTokenizer()
tokens = tk.tokenize(message, attribute=TEXT)
logger.info('ConveRT: {}'.format([t.text for t in tokens]))
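
The imports above also pull in HFTransformersNLP and LanguageModelTokenizer without using them; below is a hedged sketch of how they could join the comparison, assuming the default bert weights can be downloaded and that LanguageModelTokenizer reads the doc that HFTransformersNLP attaches to the message.

hf_nlp = HFTransformersNLP({"model_name": "bert"})
hf_nlp.process(message)  # attaches the language model doc to the message

tk = LanguageModelTokenizer()
tokens = tk.tokenize(message, attribute=TEXT)
logger.info('LanguageModel: {}'.format([t.text for t in tokens]))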
Example #28
0
def edit_tf_pt():
    if request.method == 'POST':
        if 'create' in request.form: #create new data folder
            folder_name = request.form['new_name']
            folder_path = os.path.join(app.config['UPLOAD_FOLDER'],secure_filename(folder_name))
            label_path = os.path.join(folder_path,'label')
            text_path = os.path.join(folder_path,'seq.in')
            tags_path = os.path.join(folder_path,'seq.out')
            if not os.path.exists(folder_path): # create new folder & files if they don't exist
                os.makedirs(folder_path)
                os.mknod(label_path)
                os.mknod(text_path)
                os.mknod(tags_path)
            else: # create files in the folder if they don't exist
                if not os.path.exists(label_path):
                    os.mknod(label_path)
                if not os.path.exists(text_path):
                    os.mknod(text_path)
                if not os.path.exists(tags_path):
                    os.mknod(tags_path)
            return redirect(url_for('content_tf_pt',path=folder_path))
        
        elif 'open' in request.form: #edit existing data folder
            #download multiple files from the folder
            list_folder = request.files.getlist('folder') #list()
            #check if folder contains correct files
            file_check = {'label':0, 'seq.in':0, 'seq.out':0}
            for file in list_folder:
                if os.path.basename(file.filename) in file_check:
                    file_check[os.path.basename(file.filename)] += 1
            if 0 in file_check.values(): # check if filenames meet requirements
                fail = True
                fail_message = 'Files uploaded do not match filename requirements. Please check that your label, text sequence and BIO-tag sequence files are named label, seq.in and seq.out respectively so the system can recognise them.'
                return redirect(url_for('edit_tf_pt',fail=fail,fail_message=fail_message))
            elif any(value > 1 for value in file_check.values()): # invalid data folder: more than one label, seq.in or seq.out file
                fail = True
                fail_message = 'Invalid folder selected! The folder contains more than the required number of files (3). Please select the direct parent data folder with only one instance each of the label, seq.in and seq.out files.'
                return redirect(url_for('edit_tf_pt',fail=fail,fail_message=fail_message))
            else: #success
                for file in list_folder:
                    file.save(os.path.join(app.config['UPLOAD_FOLDER'],file.filename)) #save files into folder
                folder_path = os.path.join(app.config['UPLOAD_FOLDER'],os.path.dirname(list_folder[0].filename))
                return redirect(url_for('content_tf_pt',path=folder_path))
        
        elif 'convert_rasa' in request.form: #convert rasa data file to tf/pt format
            from rasa.nlu import training_data, load_data
            from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

            curr = request.files['convert_rasa_file']
            curr.save(os.path.join(app.config['UPLOAD_FOLDER'],secure_filename(curr.filename)))
            file = os.path.join(app.config['UPLOAD_FOLDER'],secure_filename(curr.filename))

            td = training_data.load_data(file)
            formatted_examples = [ example.as_dict_nlu() for example in td.training_examples ]
            labels = [ex['intent'] for ex in formatted_examples]

            #Tokenize and clean text
            white_space_tokenizer = WhitespaceTokenizer()
            sentences = list()
            BIO_tagging = list()
            types = dict()
            for ex in formatted_examples:
                #Tokenize by white space
                white_space_tokens = white_space_tokenizer.tokenize(ex['text'])
                tokens = [token.text for token in white_space_tokens]
                #Form into input sentence
                sentence = ' '.join(tokens)
                sentences.append(sentence) #seq.in
                #Perform entity tagging
                if 'entities' in ex: #entity exists
                    ent_values = [entity['value'] for entity in ex['entities']] #entity value
                    ent_length = [len(value.split()) for value in ent_values] #length of entity word
                    ent_types = [entity['entity'] for entity in ex['entities']] #entity type
                    #form BI tags
                    for idx, typ in enumerate(ent_types):
                        ent_types[idx] = 'B-' + typ + ''.join([' I-' + typ]*(ent_length[idx] - 1))
                        types['B-' + typ] = True
                        types['I-' + typ] = True
                        # replace the (stripped) entity value in the sentence with its B-/I- tags
                        sentence = sentence.replace(ent_values[idx].strip(),ent_types[idx].strip())
                    tag_seq = sentence.split()
                    for idx, token in enumerate(tag_seq):
                        # tag all remaining (non-entity) tokens with O
                        if token not in types:
                            tag_seq[idx] = 'O'
                #no entity
                else: 
                    tag_seq = ['O' for t in tokens]
                tags = ' '.join(tag_seq)
                BIO_tagging.append(tags)
            
            file_chunk = {
                'folder_name':os.path.splitext(os.path.basename(file))[0],
                'label_name':'label',
                'text_name':'seq.in',
                'tags_name':'seq.out',
                'label_content':'\n'.join([str(i) for i in labels]) + '\n',
                'text_content':'\n'.join([str(i) for i in sentences]) + '\n',
                'tags_content':'\n'.join([str(i) for i in BIO_tagging]) + '\n'
            }
            return render_template('/edit/editor_3.html', **file_chunk) 
        
        else: #convert tf/pt data file to rasa format
            #download multiple files from the folder
            list_folder = request.files.getlist('convert_tf_pt_folder') #list()
            #check if folder contains correct files
            file_check = {'label':0, 'seq.in':0, 'seq.out':0}
            for file in list_folder:
                if os.path.basename(file.filename) in file_check:
                    file_check[os.path.basename(file.filename)] += 1
            if 0 in file_check.values(): # check if filenames meet requirements
                fail = True
                fail_message = 'Files uploaded do not match filename requirements. Please check that your label, text sequence and BIO-tag sequence files are named label, seq.in and seq.out respectively so the system can recognise them.'
                return redirect(url_for('edit_tf_pt',fail=fail,fail_message=fail_message))
            elif any(value > 1 for value in file_check.values()): # invalid data folder: more than one label, seq.in or seq.out file
                fail = True
                fail_message = 'Invalid folder selected! The folder contains more than the required number of files (3). Please select the direct parent data folder with only one instance each of the label, seq.in and seq.out files.'
                return redirect(url_for('edit_tf_pt',fail=fail,fail_message=fail_message))
            else: #success
                for file in list_folder:
                    file.save(os.path.join(app.config['UPLOAD_FOLDER'],file.filename)) #save files into folder
                folder_path = os.path.join(app.config['UPLOAD_FOLDER'],os.path.dirname(list_folder[0].filename))
                return redirect(url_for('content_to_rasa',path=folder_path))
    
    else:
        if 'fail' in request.args:
            fail = request.args.get('fail')
            fail_msg = request.args.get('fail_message')
        else:
            fail = False
            fail_msg = ""
        return render_template('/edit/index_tf-pt.html',fail=fail,fail_message=fail_msg)
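
For reference, a simplified standalone sketch (independent of Flask and of the replacement-based tagging in the route above) of how a single Rasa NLU example maps to the label / seq.in / seq.out line triple; the example data below is hypothetical.

example = {
    "text": "show me flights to New York City",
    "intent": "unk",
    "entities": [{"entity": "destination", "start": 19, "end": 32, "value": "New York City"}],
}

tokens = example["text"].split()
tags = ["O"] * len(tokens)
for entity in example["entities"]:
    ent_tokens = entity["value"].split()
    # locate the entity's token span and assign B-/I- tags
    for i in range(len(tokens) - len(ent_tokens) + 1):
        if tokens[i:i + len(ent_tokens)] == ent_tokens:
            tags[i] = "B-" + entity["entity"]
            for j in range(i + 1, i + len(ent_tokens)):
                tags[j] = "I-" + entity["entity"]
            break

print(" ".join(tokens))    # seq.in line
print(" ".join(tags))      # seq.out line: O O O O B-destination I-destination I-destination
print(example["intent"])   # label line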