コード例 #1
0
    def _convert_to_crf_tokens(self, message: Message) -> List[CRFToken]:
        """Build one ``CRFToken`` per token of the given message.

        The tokens are those returned by ``train_utils.tokens_without_cls``.
        Every token is combined with its pattern, its entity type / group /
        role tags, its part-of-speech tag and its row of dense features.

        Args:
            message: the message to convert

        Returns:
            The list of ``CRFToken`` objects, in token order.
        """
        tokens = train_utils.tokens_without_cls(message)
        dense_matrix = self._get_dense_features(message)
        tags = self._get_tags(message)

        crf_tokens = []
        for idx, token in enumerate(tokens):
            pattern = self._pattern_of_token(message, idx)
            entity_tag = self.get_tag_for(tags, ENTITY_ATTRIBUTE_TYPE, idx)
            group_tag = self.get_tag_for(tags, ENTITY_ATTRIBUTE_GROUP, idx)
            role_tag = self.get_tag_for(tags, ENTITY_ATTRIBUTE_ROLE, idx)
            pos_tag = token.get(POS_TAG_KEY)

            # without dense features every token gets an empty feature list
            if dense_matrix is not None:
                dense_row = dense_matrix[idx]
            else:
                dense_row = []

            crf_token = CRFToken(
                text=token.text,
                pos_tag=pos_tag,
                entity_tag=entity_tag,
                entity_group_tag=group_tag,
                entity_role_tag=role_tag,
                pattern=pattern,
                dense_features=dense_row,
            )
            crf_tokens.append(crf_token)

        return crf_tokens
コード例 #2
0
ファイル: bilou_utils.py プロジェクト: zylhub/rasa
def apply_bilou_schema(training_data: TrainingData,
                       include_cls_token: bool = True) -> None:
    """Compute BILOU tags and store them on every message that has entities.

    For each training example with entities, the tags for the entity type,
    role and group are derived from the entity offsets and set on the
    message under the corresponding BILOU key. Messages without entities
    are left untouched.

    Args:
        training_data: the training data whose examples are tagged in place
        include_cls_token: if False, the tokens handed to the tagger come
            from ``train_utils.tokens_without_cls`` instead of the raw
            token list of the message
    """
    attribute_to_message_key = [
        (ENTITY_ATTRIBUTE_TYPE, BILOU_ENTITIES),
        (ENTITY_ATTRIBUTE_ROLE, BILOU_ENTITIES_ROLE),
        (ENTITY_ATTRIBUTE_GROUP, BILOU_ENTITIES_GROUP),
    ]

    for message in training_data.training_examples:
        if not message.get(ENTITIES):
            continue

        tokens = message.get(TOKENS_NAMES[TEXT])
        if not include_cls_token:
            tokens = train_utils.tokens_without_cls(message)

        for attribute, message_key in attribute_to_message_key:
            mapped_entities = map_message_entities(message, attribute)
            bilou_tags = bilou_tags_from_offsets(tokens, mapped_entities)
            message.set(message_key, bilou_tags)
コード例 #3
0
    def extract_entities(self, message: Message) -> List[Dict[Text, Any]]:
        """Run the trained tagger(s) on the message and return the entities.

        Args:
            message: the message to extract entities from

        Returns:
            The entities built from the predicted tags and confidences, or
            an empty list when ``self.entity_taggers`` is None.
        """
        if self.entity_taggers is None:
            return []

        tokens = train_utils.tokens_without_cls(message)
        crf_tokens = self._convert_to_crf_tokens(message)

        marginals = {}
        for tag_name, tagger in self.entity_taggers.items():
            # every tagger other than the entity-type one also receives the
            # predictions made so far as additional features
            with_tag_features = tag_name != ENTITY_ATTRIBUTE_TYPE
            if with_tag_features:
                self._add_tag_to_crf_token(crf_tokens, marginals)

            tagger_input = self._crf_tokens_to_features(
                crf_tokens, with_tag_features)
            marginals[tag_name] = tagger.predict_marginals_single(
                tagger_input)

        # turn the marginals into a list of tags and a list of confidences
        tags, confidences = self._tag_confidences(tokens, marginals)

        entities = self.convert_predictions_into_entities(
            message.text, tokens, tags, confidences)
        return entities
コード例 #4
0
    def set_gensim_features(self,
                            message: Message,
                            attribute: Text = TEXT) -> None:
        """Add word-vector features from ``self.kv`` to the message.

        One vector is looked up per token returned by
        ``train_utils.tokens_without_cls``; the sum of all of them is
        appended as the last row. Nothing happens if the message has no
        tokens for ``attribute``.

        Args:
            message: the message to featurize in place
            attribute: the message attribute whose tokens are featurized
        """
        if not message.get(TOKENS_NAMES[attribute]):
            return None

        # tokens missing from the vocabulary are represented by a zero vector
        word_vectors = []
        for token in train_utils.tokens_without_cls(message, attribute):
            if token.text in self.kv:
                word_vectors.append(self.kv[token.text])
            else:
                word_vectors.append(np.zeros(self.kv.vector_size))

        # the sum of all word vectors serves as the vector for __CLS__
        cls_vector = reduce(lambda a, b: a + b, word_vectors)
        stacked = np.array(word_vectors + [cls_vector])

        feature_name = DENSE_FEATURE_NAMES[attribute]
        combined = self._combine_with_existing_dense_features(
            message,
            additional_features=stacked,
            feature_name=feature_name)
        message.set(DENSE_FEATURE_NAMES[attribute], combined)
コード例 #5
0
ファイル: convert_featurizer.py プロジェクト: l3SC/rasa
    def _compute_sequence_encodings(
        self, batch_examples: List[Message], module: Any, attribute: Text = TEXT
    ) -> Tuple[np.ndarray, List[int]]:
        """Compute token-level encodings for a batch of examples.

        Args:
            batch_examples: the examples to encode
            module: passed through to ``self._sequence_encoding_of_text``
            attribute: the message attribute whose tokens are encoded

        Returns:
            A tuple of the aligned token features and the number of tokens
            of each example.
        """
        list_of_tokens = []
        number_of_tokens_in_sentence = []
        for example in batch_examples:
            example_tokens = train_utils.tokens_without_cls(example, attribute)
            list_of_tokens.append(example_tokens)
            number_of_tokens_in_sentence.append(len(example_tokens))

        # The tokens are joined back into a clean text so that the sequence
        # length of the embeddings returned by ConveRT matches the number of
        # tokens (including sub-tokens).
        tokenized_texts = self._tokens_to_text(list_of_tokens)
        raw_features = self._sequence_encoding_of_text(tokenized_texts, module)

        # ConveRT might split tokens into sub-tokens; the mean of the
        # sub-token vectors is used as the vector of the token.
        aligned_features = train_utils.align_token_features(
            list_of_tokens, raw_features
        )

        return aligned_features, number_of_tokens_in_sentence
コード例 #6
0
    def _prepare_mitie_sample(training_example: Message) -> Any:
        """Convert a training example into a MITIE NER training instance.

        The instance is created from the texts of the tokens returned by
        ``train_utils.tokens_without_cls``. Every entity of the example is
        then added to it. Entities that cannot be located in the tokens or
        that MITIE rejects are skipped with a warning; the remaining
        entities are still added.

        Args:
            training_example: the message to convert

        Returns:
            The ``mitie.ner_training_instance`` for the example.
        """
        import mitie

        text = training_example.text
        tokens = train_utils.tokens_without_cls(training_example)
        sample = mitie.ner_training_instance([t.text for t in tokens])
        for ent in training_example.get(ENTITIES, []):
            try:
                # if the token is not aligned an exception will be raised
                start, end = MitieEntityExtractor.find_entity(
                    ent, text, tokens)
            except ValueError as e:
                # the trailing space keeps "skipped." and "Error:" apart
                raise_warning(f"Failed to use example '{text}' to train MITIE "
                              f"entity extractor. Example will be skipped. "
                              f"Error: {e}")
                continue
            try:
                # mitie will raise an exception on malicious
                # input - e.g. on overlapping entities
                sample.add_entity(list(range(start, end)), ent["entity"])
            except Exception as e:
                # report the offending entity itself; the exception is
                # already given as the reason at the end of the message
                raise_warning(f"Failed to add entity example "
                              f"'{ent}' of sentence '{text}'. "
                              f"Example will be ignored. Reason: "
                              f"{e}")
                continue
        return sample
コード例 #7
0
    def process(self, message: Message, **kwargs: Any) -> None:
        """Compute MITIE features for the message and attach them to it.

        Args:
            message: the message to featurize in place
            **kwargs: forwarded to ``self._mitie_feature_extractor``
        """
        extractor = self._mitie_feature_extractor(**kwargs)
        token_features = self.features_for_tokens(
            train_utils.tokens_without_cls(message), extractor)

        alias = self.component_config[FEATURIZER_CLASS_ALIAS]
        message.add_features(Features(token_features, TEXT, alias))
コード例 #8
0
ファイル: mitie_featurizer.py プロジェクト: zylhub/rasa
    def process(self, message: Message, **kwargs: Any) -> None:
        """Compute MITIE features and store them as dense text features.

        The new features are combined with the dense features already
        present on the message before being set.

        Args:
            message: the message to featurize in place
            **kwargs: forwarded to ``self._mitie_feature_extractor``
        """
        extractor = self._mitie_feature_extractor(**kwargs)
        token_features = self.features_for_tokens(
            train_utils.tokens_without_cls(message), extractor)

        feature_name = DENSE_FEATURE_NAMES[TEXT]
        combined = self._combine_with_existing_dense_features(
            message, token_features, DENSE_FEATURE_NAMES[TEXT])
        message.set(feature_name, combined)
コード例 #9
0
ファイル: mitie_featurizer.py プロジェクト: zylhub/rasa
 def process_training_example(self, example: Message, attribute: Text,
                              mitie_feature_extractor: Any):
     """Featurize one attribute of a training example with MITIE.

     The features are combined with the dense features already present on
     the example. Nothing is done when no tokens are returned for the
     attribute.

     Args:
         example: the training example to featurize in place
         attribute: the message attribute to featurize
         mitie_feature_extractor: passed to ``self.features_for_tokens``
     """
     tokens = train_utils.tokens_without_cls(example, attribute)
     if tokens is None:
         return

     token_features = self.features_for_tokens(tokens,
                                               mitie_feature_extractor)
     feature_name = DENSE_FEATURE_NAMES[attribute]
     combined = self._combine_with_existing_dense_features(
         example, token_features, DENSE_FEATURE_NAMES[attribute])
     example.set(feature_name, combined)
コード例 #10
0
    def process_training_example(self, example: Message, attribute: Text,
                                 mitie_feature_extractor: Any):
        """Featurize one attribute of a training example with MITIE.

        The resulting ``Features`` object is added to the example. Nothing
        is done when no tokens are returned for the attribute.

        Args:
            example: the training example to featurize in place
            attribute: the message attribute to featurize
            mitie_feature_extractor: passed to ``self.features_for_tokens``
        """
        tokens = train_utils.tokens_without_cls(example, attribute)
        if tokens is None:
            return

        token_features = self.features_for_tokens(tokens,
                                                  mitie_feature_extractor)
        alias = self.component_config[FEATURIZER_CLASS_ALIAS]
        example.add_features(Features(token_features, attribute, alias))
コード例 #11
0
    def set_fasttext_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Add word vectors from ``self.model`` to the message as dense features.

        One vector is computed per token returned by
        ``train_utils.tokens_without_cls``; the vector of the whole message
        text is appended as the last row. Nothing happens if the message has
        no tokens for ``attribute``.

        Args:
            message: the message to featurize in place
            attribute: the message attribute whose tokens are featurized
        """
        if not message.get(TOKENS_NAMES[attribute]):
            return None

        sentence_vector = self.model.get_word_vector(message.text)

        token_vectors = []
        for token in train_utils.tokens_without_cls(message, attribute):
            token_vectors.append(self.model.get_word_vector(token.text))

        # the sentence vector is the extra row needed for __CLS__
        stacked = np.array(token_vectors + [sentence_vector])

        feature_name = DENSE_FEATURE_NAMES[attribute]
        combined = self._combine_with_existing_dense_features(
            message, additional_features=stacked, feature_name=feature_name
        )
        message.set(DENSE_FEATURE_NAMES[attribute], combined)
コード例 #12
0
    def process(self, message: Message, **kwargs: Any) -> None:
        """Extract entities with MITIE and append them to the message.

        Args:
            message: the message to process in place
            **kwargs: must contain a truthy ``mitie_feature_extractor``

        Raises:
            Exception: if no MITIE feature extractor was passed in
        """
        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        text = message.text
        tokens = train_utils.tokens_without_cls(message)
        found = self.extract_entities(text, tokens, mitie_feature_extractor)

        named = self.add_extractor_name(found)
        cleaned = self.clean_up_entities(message, named)

        # keep the entities already on the message and append the new ones
        existing = message.get(ENTITIES, [])
        message.set(ENTITIES, existing + cleaned, add_to_output=True)
コード例 #13
0
    def set_bpemb_features(self,
                           message: Message,
                           attribute: Text = TEXT) -> None:
        """Add vectors from ``self.create_word_vector`` as dense features.

        One vector is created per token returned by
        ``train_utils.tokens_without_cls``; the vector of the whole message
        text is appended as the last row. Nothing happens if the message has
        no tokens for ``attribute``.

        Args:
            message: the message to featurize in place
            attribute: the message attribute whose tokens are featurized
        """
        if not message.get(TOKENS_NAMES[attribute]):
            return None

        sentence_vector = self.create_word_vector(document=message.text)

        token_vectors = []
        for token in train_utils.tokens_without_cls(message, attribute):
            token_vectors.append(self.create_word_vector(document=token.text))

        stacked = np.array(token_vectors + [sentence_vector])

        feature_name = DENSE_FEATURE_NAMES[attribute]
        combined = self._combine_with_existing_dense_features(
            message,
            additional_features=stacked,
            feature_name=feature_name)
        message.set(DENSE_FEATURE_NAMES[attribute], combined)
コード例 #14
0
    def _create_feature_to_idx_dict(
            self, training_data: TrainingData) -> Dict[Text, Dict[Text, int]]:
        """Map every feature value seen in the training data to an index.

        Each feature key, defined in the component configuration, points to
        its feature values and their indices in the overall resulting
        feature vector.

        Args:
            training_data: the training data to collect feature values from

        Returns:
            The result of ``self._map_features_to_indices`` applied to the
            vocabulary built from all training examples.
        """
        # collect the features of every training example
        features_per_example = [
            self._tokens_to_features(train_utils.tokens_without_cls(example))
            for example in training_data.training_examples
        ]

        # build the vocabulary, then give each feature value a unique index
        vocabulary = self._build_feature_vocabulary(features_per_example)
        return self._map_features_to_indices(vocabulary)
コード例 #15
0
    def _get_tags(self, message: Message) -> Dict[Text, List[Text]]:
        """Collect the entity tags of the message for every tag name.

        Args:
            message: the message to read the tags from

        Returns:
            A dictionary mapping each tag name in ``self.crf_order`` to its
            list of tags.
        """
        tokens = train_utils.tokens_without_cls(message)
        tags_by_name = {}

        for tag_name in self.crf_order:
            if not self.component_config[BILOU_FLAG]:
                # without BILOU tagging the label is determined per token
                tags_by_name[tag_name] = [
                    determine_token_labels(token,
                                           message.get(ENTITIES),
                                           attribute_key=tag_name)
                    for token in tokens
                ]
                continue

            bilou_key = bilou_utils.get_bilou_key_for_tag(tag_name)
            if message.get(bilou_key):
                tags_by_name[tag_name] = message.get(bilou_key)
            else:
                # no BILOU tags stored: every token counts as "no entity"
                tags_by_name[tag_name] = [NO_ENTITY_TAG for _ in tokens]

        return tags_by_name