Ejemplo n.º 1
0
    def _get_message_text(self, message: Message):
        """tokens should obatined from fore-pipeline expected"""

        if message.get("tokens"):  # if directly tokens is provided
            return ' '.join([t.text for t in message.get("tokens")])
        else:
            return message.text
Ejemplo n.º 2
0
    def process(self, message: Message, **kwargs) -> None:
        """Classify the message text and attach intent results.

        Builds the decode pipeline from the pretrained tokenizer, decodes
        non-blank text into ranked (label, score) pairs, and writes
        "intent" and "intent_ranking" onto the message. Blank messages
        receive the empty default intent and an empty ranking.
        """
        intent = {"name": None, "confidence": 0.}
        intent_ranking = []

        tokenizer = load_pretrained_tokenizer(self.pre_path)
        decode_pipeline = TrainingPipeLine(device=self.device,
                                           int2idx=self.int2idx,
                                           idx2int=self.idx2int)

        text = message.text
        if text.strip():
            score, label = decode_pipeline.decode(model=self.model,
                                                  tokenizer=tokenizer,
                                                  max_len=self.max_seq_len,
                                                  text=text,
                                                  ranks=INTENT_RANKING_LENGTH)

            # best hit becomes the intent; the remainder forms the ranking
            intent = {"name": label[0], "confidence": score[0]}
            intent_ranking = [
                {"name": name, "confidence": conf}
                for name, conf in zip(label[1:], score[1:])
            ]

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Ejemplo n.º 3
0
    def process(self, message: Message, **kwargs: Any):
        """Classify the message with the TF embedding model.

        Feeds the message's "text_features" through the loaded graph,
        converts the similarity scores into confidences, and writes
        "intent" and "intent_ranking" onto the message. When no graph is
        loaded an empty default intent is emitted.
        """
        intent = {"name": None, "confidence": 0.}
        intent_ranking = []

        if self.graph is None:
            logger.error("No model loaded")

        else:
            feature = {
                "x": message.get("text_features").reshape(1, -1),
                "y": np.stack([self.encoded_intents_bag for _ in range(1)]),
                "intent": np.array([0])
            }

            sim, a, b = self.session.run(
                [
                    self.graph.get_tensor_by_name("sim:0"),
                    self.graph.get_tensor_by_name("text_embed:0"),
                    self.graph.get_tensor_by_name("intent_embed:0")
                ],
                feed_dict={
                    self.graph.get_tensor_by_name("text_in:0"): feature["x"],
                    self.graph.get_tensor_by_name("intent_in:0"): feature["y"],
                    self.graph.get_tensor_by_name("label_in:0"):
                    feature["intent"],
                    self.graph.get_tensor_by_name("dropout:0"): 1.
                })

            if self.component_config["similarity_type"] == "cosine":
                # clip negative similarities so confidences stay in [0, 1]
                sim[sim < 0.] = 0.

            elif self.component_config["similarity_type"] == "inner":
                # softmax over the similarities; the original divided by the
                # sum of the RAW scores, which is not a valid normalization
                sim = np.exp(sim) / np.sum(np.exp(sim))

            sim = sim.flatten()
            intent_ids = np.argsort(sim)[::-1]

            intent = {
                "name": self.idx2int[intent_ids[0]],
                "confidence": sim[intent_ids[0]].tolist()
            }

            # `idx` is already an intent index; the original double-indexed
            # `intent_ids[idx]`, pairing each confidence with the wrong name
            intent_ranking = [{
                "name": self.idx2int[idx],
                "confidence": sim[idx].tolist()
            } for idx in intent_ids[:INTENT_RANKING_LENGTH]]

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Ejemplo n.º 4
0
    def process(self, message: Message, **kwargs) -> None:
        """Compute count-vector text features for the message.

        Transforms the (tokenized) message text with the fitted
        vectorizer and merges the result with any existing
        "text_features". Skips silently (with an error log) when no
        vectorizer has been trained/loaded.
        """

        if self.vec is None:
            # typo fixed: "loade" -> "loaded"
            logger.error(
                "No model loaded, and count vectors features pipeline ignored")
            return

        msg = self._get_message_text(message)

        fea = self.vec.transform([msg]).toarray().squeeze()

        message.set("text_features",
                    self._combine_with_existing_text_features(message, fea))
Ejemplo n.º 5
0
    def read_from_json(self, js, **kwargs):
        """Loads training data stored in the rasa NLU data format.

        Merges "common_examples" with the deprecated "intent_examples"
        and "entity_examples" sections and returns a TrainingData built
        from all of them, plus synonyms and regex features.
        """
        validate_rasa_nlu_data(js)

        data = js['rasa_nlu_data']
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        regex_intent = data.get("regex_intent", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        if intent_examples or entity_examples:
            # logger.warn is deprecated in favour of logger.warning
            logger.warning("DEPRECATION warning: your rasa data "
                           "contains 'intent_examples' "
                           "or 'entity_examples' which will be "
                           "removed in the future. Consider "
                           "putting all your examples "
                           "into the 'common_examples' section.")

        all_examples = common_examples + intent_examples + entity_examples
        training_examples = [
            Message.build(ex['text'], ex.get("intent"), ex.get("entities"))
            for ex in all_examples
        ]

        return TrainingData(training_examples, entity_synonyms,
                            regex_features, regex_intent)
Ejemplo n.º 6
0
    def read_from_json(self, js, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the WIT.ai data format."""

        training_examples = []

        for sample in js["data"]:
            raw_entities = sample.get("entities")
            if raw_entities is None:
                continue
            text = sample.get("text")

            # the intent is encoded as a pseudo-entity named 'intent'
            intent_values = [e["value"] for e in raw_entities
                             if e["entity"] == 'intent']
            intent = intent_values[0].strip("\"") if intent_values else None

            entities = []
            for e in raw_entities:
                if "start" in e and "end" in e and e["entity"] != 'intent':
                    # for some reason wit adds additional quotes around
                    # entity values
                    e["value"] = e["value"].strip("\"")
                    entities.append(e)

            data = {"entities": entities}
            if intent:
                data["intent"] = intent
            training_examples.append(Message(text, data))
        return TrainingData(training_examples)
Ejemplo n.º 7
0
    def process(self, message: Message, **kwargs: Any):
        """Run BiLSTM-CRF entity extraction on the message.

        Converts the raw text and upstream tokens into (text, words, tags),
        decodes entities with either the embedding- or BERT-based decoder
        (selected via the "embedding" config key), normalizes the predicted
        tag sequence, and appends the extracted entities to the message.
        """

        if self.sess is None:
            logger.error(
                "`BiLSTM-CRF projection model` not trained correctly,"
                "components will pass and pipeline procedure continue")
            return

        res = []

        # decoder selector: expected values are "embedding" or "bert"
        _embedding = self.component_config['embedding']

        r_text = message.text
        # token texts produced by an upstream tokenizer; empty if absent
        token = [x.text for x in message.get("tokens", [])]

        normalizer = self.normalizer
        lower = self.component_config["lower_case"]
        tag_schema = self.component_config["tag_schema"]

        # ent_offsets=[] because there are no gold entities at inference time
        text, words, tags = Processor()._convert_exam(text=r_text,
                                                      words=token,
                                                      ent_offsets=[],
                                                      tag_schema=tag_schema,
                                                      normalizer=normalizer,
                                                      lower=lower)

        if _embedding == "embedding":
            res, confidence = self._decode_embedding_entities(
                text, words, tags)
        elif _embedding == "bert":
            res, confidence = self._decode_bert_entities(text, words, tags)
        else:
            # unknown decoder type: leave the message untouched
            return

        # TODO: add confidence to result
        res = tagNormalizer(res).run()

        extracted = self.add_extractor_name(
            pred_result_to_json(r_text, res, confidence))

        # append to (not replace) entities found by earlier extractors
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Ejemplo n.º 8
0
    def _read_intent(self, intent_js, examples_js):
        """Reads the intent and examples from respective jsons."""
        name = intent_js.get("name")

        examples = []
        for ex in examples_js:
            chunk_text, chunk_entities = self._join_text_chunks(ex['data'])
            examples.append(Message.build(chunk_text, name, chunk_entities))

        return TrainingData(examples)
Ejemplo n.º 9
0
    def process(self, message: Message, **kwargs: Any) -> None:
        """Attach a parser-based intent to the message.

        Blank messages receive the empty default intent and an empty
        ranking. Otherwise the parser result wins; failing that, any
        intent set by an earlier component is kept; failing that, the
        empty default is set.
        """
        empty_intent = {"name": None, "confidence": 0.}

        if not message.text.strip():
            message.set("intent", empty_intent, add_to_output=True)
            message.set("intent_ranking", [], add_to_output=True)
            return

        parsed = self.parser(message.text)
        if parsed:
            message.set("intent", parsed, add_to_output=True)
        elif not message.get("intent"):
            message.set("intent", empty_intent, add_to_output=True)
Ejemplo n.º 10
0
    def process(self, message: Message, **kwargs) -> None:
        """Classify the message with the ALBERT-based model.

        Builds a DataIterPack around the pretrained tokenizer, decodes
        non-blank text into ranked (label, score) pairs, and writes
        "intent" and "intent_ranking" onto the message. Blank messages
        keep the empty default intent and an empty ranking.
        """
        intent = {"name": None, "confidence": 0.}
        ranking = []

        tokenizer = albert.load_pretrained_tokenizer(self.pre_path)

        packer = albert.DataIterPack(message=None,
                                     tokenizer=tokenizer,
                                     max_seq_len=self.max_seq_len,
                                     model=self.model,
                                     int2idx=self.int2idx,
                                     idx2int=self.idx2int,
                                     device=self.device)

        if message.text.strip():
            score, label = packer.decode(message.text, INTENT_RANKING_LENGTH)

            # top hit is the intent; the rest forms the ranking
            intent = {"name": label[0], "confidence": score[0]}
            ranking = [{"name": name, "confidence": conf}
                       for name, conf in zip(label[1:], score[1:])]

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", ranking, add_to_output=True)
Ejemplo n.º 11
0
    def parse(self, text, time=None, only_output_properties=True):
        # type: (Text) -> Dict[Text, Any]
        """Parse the input text, classify it and return pipeline result.

        The pipeline result usually contains intent and entities."""

        if not text:
            # Not all components are able to handle empty strings. So we need
            # to prevent that... This default return will not contain all
            # output attributes of all components, but in the end, no one
            # should pass an empty string in the first place.
            empty_result = self.default_output_attributes()
            empty_result["text"] = ""
            return empty_result

        message = Message(text, self.default_output_attributes(), time=time)

        # run each pipeline component over the message in order
        for step in self.pipeline:
            step.process(message, **self.context)

        result = self.default_output_attributes()
        result.update(
            message.as_dict(only_output_properties=only_output_properties))
        return result
Ejemplo n.º 12
0
    def filter_trainable_entities(self, entity_examples):
        # type: (List[Message]) -> List[Message]
        """Filters out untrainable entity annotations.

        Creates a copy of entity_examples in which entities that have
        `extractor` set to something other than self.name (e.g. 'ner_crf')
        are removed."""

        result = []
        for example in entity_examples:
            # keep entities with no extractor tag, or tagged with our name
            kept = [
                ent for ent in example.get("entities", [])
                if not ent.get("extractor")
                or ent.get("extractor") == self.name
            ]
            new_data = dict(example.data)
            new_data['entities'] = kept
            result.append(
                Message(text=example.text,
                        data=new_data,
                        output_properties=example.output_properties,
                        time=example.time))

        return result
Ejemplo n.º 13
0
    def read_from_json(self, js, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the LUIS.ai data format."""

        examples = []
        regex_features = []

        # Simple check to ensure we support this luis data schema version
        version = js["luis_schema_version"]
        if not version.startswith("2"):
            raise Exception("Invalid luis data schema version {}, should be 2.x.x. "
                            "Make sure to use the latest luis version "
                            "(e.g. by downloading your data again)."
                            "".format(version))

        for feature in js.get("regex_features", []):
            if feature.get("activated", False):
                regex_features.append({"name": feature.get("name"),
                                       "pattern": feature.get("pattern")})

        for utterance in js["utterances"]:
            text = utterance.get("text")
            intent = utterance.get("intent")

            entities = []
            for e in utterance.get("entities") or []:
                # LUIS end positions are inclusive; make the end exclusive
                start = e["startPos"]
                end = e["endPos"] + 1
                entities.append({"entity": e["entity"],
                                 "value": text[start:end],
                                 "start": start,
                                 "end": end})

            data = {"entities": entities}
            if intent:
                data["intent"] = intent
            examples.append(Message(text, data))
        return TrainingData(examples, regex_features=regex_features)
Ejemplo n.º 14
0
    def _convert_entity(self, example: Message):
        """get training entities for a example"""
        entities = example.get("entities", [])

        return [(ent["start"], ent["end"], ent["entity"]) for ent in entities]