Example #1
    def tokenize(self, text: Text) -> typing.List[Token]:
        if self.third_party_service_endpoint is not None:
            req = requests.post(self.third_party_service_endpoint, data={"text": text})
            return [Token(v["text"], v["end"]) for v in req.json()]
        else:
            logger.warning(
                "Third party tokenizer component in pipeline, but no "
                "`third_party_service_endpoint` configuration in the config."
            )
            return [Token(text, 0)]
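The snippet assumes the remote service answers with a JSON list of objects carrying "text" and "end" fields. A minimal self-contained sketch of the same call outside the component class (the function name and the payload shape are assumptions for illustration):

import typing

import requests

from rasa.nlu.tokenizers import Token


def remote_tokenize(endpoint: typing.Text, text: typing.Text) -> typing.List[Token]:
    # POST the raw text and build one Token per item in the JSON response,
    # e.g. [{"text": "hello", "end": 0}, {"text": "world", "end": 6}]
    response = requests.post(endpoint, data={"text": text})
    return [Token(item["text"], item["end"]) for item in response.json()]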
Example #2
    def tokenize(self, text: Text) -> typing.List[Token]:
        if self.third_party_service_endpoint is not None:
            headers = {'Content-type': 'application/json', 'Accept': 'application/json'}
            req = requests.post(self.third_party_service_endpoint,
                                data=json.dumps({"text": text}), headers=headers)
            return [Token(v["text"], v["end"]) for v in req.json()]
        else:
            logger.warning(
                "Third party tokenizer component in pipeline, but no "
                "`third_party_service_endpoint` configuration in the config."
            )
            return [Token(text, 0)]
Example #3
def test_tokens_comparison():
    import pytest
    from rasa.nlu.tokenizers import Token

    x = Token("hello", 0)
    y = Token("Hello", 0)

    assert x == x
    assert y < x

    assert x != 1

    with pytest.raises(TypeError):
        assert y < "a"
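For reference, a minimal Token sketch that would satisfy this test is shown below; it compares tokens as an (offset, text) tuple and returns NotImplemented for foreign types so that ordering against a string raises TypeError. This is an illustration only, not Rasa's actual implementation.

class Token:
    def __init__(self, text, offset):
        self.text = text
        self.offset = offset
        self.end = offset + len(text)

    def __eq__(self, other):
        if not isinstance(other, Token):
            return NotImplemented  # makes `x != 1` evaluate to True
        return (self.offset, self.text) == (other.offset, other.text)

    def __lt__(self, other):
        if not isinstance(other, Token):
            return NotImplemented  # ordering against "a" raises TypeError
        return (self.offset, self.text) < (other.offset, other.text)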
Example #4
    def tokenize(self,
                 text: Text,
                 attribute: Text = MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:

        if not self.case_sensitive:
            text = text.lower()
        # remove 'not a word character' if
        if attribute != MESSAGE_INTENT_ATTRIBUTE:
            words = re.sub(
                # there is a space or an end of a string after it
                r"[^\w#@&]+(?=\s|$)|"
                # there is a space or beginning of a string before it
                # not followed by a number
                r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
                # not in between numbers and not . or @ or & or - or #
                # e.g. 10'000.00 or [email protected]
                # and not url characters
                r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
                " ",
                text,
            ).split()
        else:
            words = (text.split(self.intent_split_symbol)
                     if self.intent_tokenization_flag else [text])

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
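As a rough illustration of the offset bookkeeping (the sample input only needs the first alternative of the pattern above; the input string is chosen for illustration):

import re

from rasa.nlu.tokenizers import Token

text = "hey! how are you?"
# punctuation followed by whitespace or end-of-string collapses to a space
words = re.sub(r"[^\w#@&]+(?=\s|$)", " ", text).split()  # ["hey", "how", "are", "you"]

running_offset = 0
tokens = []
for word in words:
    # offsets still point into the original, unmodified string
    word_offset = text.index(word, running_offset)
    running_offset = word_offset + len(word)
    tokens.append(Token(word, word_offset))
# resulting offsets: hey -> 0, how -> 5, are -> 9, you -> 13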
Example #5
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of real text to make sure the count
    # vectors can only come from the `tokens` feature;
    # using `message.text` would not produce the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
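The `tokens` and `expected` parameters come from a `pytest.mark.parametrize` decorator that is not part of this excerpt; a hypothetical parametrization consistent with the assertion (values chosen for illustration, not taken from the Rasa test suite) could look like:

import pytest


@pytest.mark.parametrize(
    "tokens, expected",
    [
        # a single vocabulary item seen twice -> one column with count 2
        (["hello", "hello"], [[2]]),
        # columns are sorted alphabetically: "goodbye" once, "hello" twice
        (["hello", "goodbye", "hello"], [[1, 2]]),
    ],
)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ...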
Example #6
    def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]:
        import jieba

        text = self.preprocess_text(text, attribute)
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
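`jieba.tokenize` yields `(word, start, end)` tuples with character offsets, which is why only `word` and `start` are kept. Roughly (the exact segmentation depends on jieba's dictionary):

import jieba

# for "我来到北京" the default mode yields tuples such as
# ('我', 0, 1), ('来到', 1, 3), ('北京', 3, 5)
for word, start, end in jieba.tokenize("我来到北京"):
    print(word, start, end)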
Example #7
    def tokenize(self, text: Text) -> List[Token]:
        import MicroTokenizer

        tokenized = MicroTokenizer.cut(text, **self.kwargs)

        tokens = []
        offset = 0
        for word in tokenized:
            tokens.append(Token(word, offset))
            offset += len(word)

        return tokens
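Note that accumulating `offset += len(word)` only lines up with positions in the original string when the tokens concatenate back to the input exactly (no whitespace or characters dropped), which is typically the case for Chinese text passed to MicroTokenizer.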
Example #8
    def tokenize(text: Text) -> List[Token]:
        def mecabsplit(mecab_tagger, inputs, pos):
            # split the MeCab parse output into (morpheme, POS) pairs,
            # expanding the compound-morpheme field when it is not '*'
            r = []
            inputs = mecab_tagger.parse(inputs)
            t = inputs.split('\n')[:-2]
            for i in t:
                field = i.split('\t')
                if field[1].split(',')[-1] != '*':
                    r.extend([(x.split('/')[0], x.split('/')[1])
                              for x in field[1].split(',')[-1].split('+')])
                else:
                    r.append((field[0], field[1].split(',')[0]))
            if pos:
                return r
            return [x[0] for x in r]

        mecab_tagger = MeCab.Tagger()

        a = mecab_tagger.parse(text)
        t = a.split('\n')[:-2]
        tokenpointer = []
        pointeroffset = 0

        for i in t:
            field = i.split('\t')
            if field[1].split(',')[-1] != '*':
                currentptr = text.index(field[0], pointeroffset)
                for x in field[1].split(',')[-1].split('+'):
                    try:
                        w = x.split('/')[0]
                        temp = field[0].index(w)
                        tokenpointer.append(
                            (currentptr + temp, currentptr + temp + len(w)))
                    except ValueError:
                        # the morpheme is not a literal substring of the
                        # surface form; fall back to the whole surface span
                        tokenpointer.append(
                            (currentptr, currentptr + len(field[0])))
                pointeroffset = currentptr + len(field[0])
            else:
                currentptr = text.index(field[0], pointeroffset)
                tokenpointer.append((currentptr, currentptr + len(field[0])))
                pointeroffset = currentptr + len(field[0])
        words = mecabsplit(mecab_tagger, text, False)
        tokens = []
        for i, word in enumerate(words):
            word_offset = tokenpointer[i][0]
            tokens.append(Token(word, word_offset))

        return tokens
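For context: `MeCab.Tagger().parse()` returns one `surface\tfeature1,feature2,...` line per analyzed unit followed by an `EOS` line, which is why the code drops the last two entries after splitting on newlines. The snippet appears to assume a Korean dictionary (e.g. mecab-ko-dic) in which the last feature field is either `*` or a `+`-joined decomposition of compound morphemes, each entry of the form `morpheme/tag/...`, hence the `split('+')` and `split('/')` calls.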
Example #9
    def tokenize(self, text: Text) -> List[Token]:

        # there is space or end of string after punctuation
        # because we do not want to replace 10.000 with 10 000
        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
Example #10
    def tokenize(text: Text) -> List[Token]:

        mt = MeCab.Tagger("-d /usr/local/lib/mecab/dic/mecab-ko-dic")
        parsed = mt.parse(text)
        x = parsed.replace("\n", "\t").split("\t")
        words = []
        for i in range(0, len(x) - 2, 2):
            w = x[i]
            words.append(w)

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
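The index arithmetic relies on MeCab's output format: after replacing newlines with tabs, even positions hold surface forms and odd positions hold feature strings, with `EOS` and a trailing empty string at the end (hence the `len(x) - 2` bound). A small illustration with a hard-coded parse string (features abbreviated):

# a parse result has the form "surface\tfeatures\n...\nEOS\n"
parsed = "아버지\tNNG,...\n가\tJKS,...\nEOS\n"
x = parsed.replace("\n", "\t").split("\t")
# -> ["아버지", "NNG,...", "가", "JKS,...", "EOS", ""]
words = [x[i] for i in range(0, len(x) - 2, 2)]  # ["아버지", "가"]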
Example #11
    def tokenize(text: Text) -> List[Token]:

        mt = MeCab.Tagger()
        parsed = mt.parse(text)
        x = (parsed.replace('\n', '\t').split('\t'))
        words = []
        for i in range(0, len(x)-2, 2):
            w = x[i]
            words.append(w)

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
Example #12
    def tokenize(text: Text) -> List[Token]:
        # remove 'not a word character' if
        words = re.sub(
            # there is a space or an end of a string after it
            r'[^\w#@&]+(?=\s|$)|'
            # there is a space or beginning of a string before it
            # not followed by a number
            r'(\s|^)[^\w#@&]+(?=[^0-9\s])|'
            # not in between numbers and not . or @ or & or - or #
            # e.g. 10'000.00 or [email protected]
            # and not url characters
            r'(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])',
            ' ',
            text).split()

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
Example #13
    def tokenize(self, doc: "Doc") -> typing.List[Token]:

        return [Token(t.text, t.idx) for t in doc]
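A quick way to see this in action, assuming some spaCy model is installed (the model name here is an assumption):

import spacy

from rasa.nlu.tokenizers import Token

nlp = spacy.load("en_core_web_sm")  # any installed spaCy model works
doc = nlp("Hello New York")

# t.idx is the character offset of each spaCy token in the original text
tokens = [Token(t.text, t.idx) for t in doc]
# -> offsets: Hello -> 0, New -> 6, York -> 10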
Example #14
    def _token_from_offset(self, text: bytes, offset: int,
                           encoded_sentence: bytes) -> Token:
        return Token(
            text.decode(DEFAULT_ENCODING),
            self._byte_to_char_offset(encoded_sentence, offset),
        )
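The byte-to-character conversion itself is not part of this excerpt; one plausible implementation, sketched under the assumption that `offset` counts bytes of the encoded sentence (not necessarily Rasa's exact code), decodes the byte prefix and takes its length:

    @staticmethod
    def _byte_to_char_offset(text: bytes, byte_offset: int) -> int:
        # the number of characters in the decoded byte prefix equals
        # the character offset that `byte_offset` points at
        return len(text[:byte_offset].decode(DEFAULT_ENCODING))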
Example #15
    def process(self, message: 'Message', **kwargs: Any) -> None:
        """Process an incoming message.

        This is the components chance to process an incoming
        message. The component can rely on
        any context attribute to be present, that gets created
        by a call to :meth:`components.Component.pipeline_init`
        of ANY component and
        on any context attributes created by a call to
        :meth:`components.Component.process`
        of components previous to this one."""

        self.sium.set_context(self.context)

        # TODO: lowercase IU

        # The Latest IU is being appended to
        # "iu_list" in the message,
        # so we grab last one out of that.
        iu_list = message.get("iu_list")
        new_iu = iu_list[-1]
        # Extract into tuple of (word, type)
        # where type is either an "add" or "revoke".
        iu_word, iu_type = new_iu
        # If it's an add, we have to update our intents
        # and extract any entities if they meet our threshold.
        # We also have to keep track of our word offset for
        # the entities message.
        if iu_type == "add":
            self.tokens.append(Token(iu_word, self.word_offset))
            props, prop_dist = self.sium.add_word_increment({"word": iu_word})
            for p in props:
                # if the entity confidence is above 0.5,
                # add that entity
                if prop_dist.prob(p) > 0.5:
                    self.extracted_entities.append({
                        'start': self.word_offset,
                        'end': self.word_offset + len(iu_word) - 1,
                        'value': iu_word,
                        'entity': p,
                        'confidence': prop_dist.prob(p),
                        'extractor': 'rasa_sium'
                    })
            self.word_offset += len(iu_word)
        elif iu_type == "revoke":
            # Need to undo everything above, remove tokens,
            # revoke word, remove extracted entities, subtract word_offset.
            self.word_offset -= len(iu_word)
            # Remove our latest token from our list.
            self.tokens.pop()
            # This is a bit more difficult, basically, if we have
            # our word show up in any extracted entities, then we
            # need to remove that entity from our list of entities.
            if self.extracted_entities:
                last_entity = self.extracted_entities[-1]
                if iu_word in last_entity.values():
                    self.extracted_entities.pop()
            self.sium.revoke()
        else:
            logger.error("incompatible iu type, expected 'add' or 'revoke',"
                         " got '" + iu_type + "'")
        pred_intent, intent_ranks = self.__get_intents_and_ranks()
        message.set("intent", pred_intent, add_to_output=True)
        message.set("intent_ranking", intent_ranks)
        message.set("tokens", self.tokens)
        message.set("entities", self.extracted_entities, add_to_output=True)
Example #16
                "name": "DucklingHTTPExtractor"
            },
        ]
    })
    return utilities.interpreter_for(
        component_builder,
        data="./data/examples/rasa/demo-rasa.json",
        path=tmpdir_factory.mktemp("projects").strpath,
        config=conf,
    )


# Chinese Example
# "对面食过敏" -> To be allergic to wheat-based food
CH_wrong_segmentation = [
    Token("对面", 0),
    Token("食", 2),
    Token("过敏", 3),  # opposite, food, allergy
]
CH_correct_segmentation = [
    Token("对", 0),
    Token("面食", 1),
    Token("过敏", 3),  # towards, wheat-based food, allergy
]
CH_wrong_entity = {"start": 0, "end": 2, "value": "对面", "entity": "direction"}
CH_correct_entity = {
    "start": 1,
    "end": 3,
    "value": "面食",
    "entity": "food_type"
}
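In both variants the offsets index characters of "对面食过敏" (对 = 0, 面 = 1, 食 = 2, 过 = 3, 敏 = 4) and the entity `end` values are exclusive, so the correct `面食` entity covers characters 1 and 2.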
Example #17
    def tokenize(text: Text) -> List[Token]:
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #18
    def _token_from_offset(self, text, offset, encoded_sentence):
        return Token(text.decode("utf-8"),
                     self._byte_to_char_offset(encoded_sentence, offset))

    def tokenize(self, doc: 'Doc') -> typing.List[Token]:

        return [Token(t.lemma_, t.idx) for t in doc]
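Unlike Example #13, this spaCy-based variant stores the lemma (`t.lemma_`) rather than the surface text in each Token, while keeping the original character offset `t.idx`.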