def tokenize(self, text):
        # type: (Text) -> List[Token]
        import MeCab

        # Japanese tinysegmenter (kept for reference):
        # import tinysegmenter
        # tokenizer = tinysegmenter.TinySegmenter()
        # words = tokenizer.tokenize(text)

        # Japanese janome (kept for reference):
        # from janome.tokenizer import Tokenizer
        # tokenizer = Tokenizer()
        # words = tokenizer.tokenize(text, wakati=True)

        # Japanese MeCab with the mecab-ipadic-neologd dictionary
        m = MeCab.Tagger(" -d /usr/lib/mecab/dic/mecab-ipadic-neologd/")
        m.parse("")  # work around MeCab's surface-encoding quirk on the first parse
        node = m.parseToNode(text)
        words = []
        while node:
            if node.surface:  # skip the empty BOS/EOS nodes
                words.append(node.surface)
            node = node.next

        tokenized = [(word, text.find(word), text.find(word) + len(word))
                     for word in words]
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
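
A note on the offset computation above: text.find(word) always returns the first occurrence, so a surface form that appears twice in the input gets the offset of its first appearance both times. A small sketch of the usual fix, searching from a running offset (same words, text and Token as above):

running_offset = 0
tokens = []
for word in words:
    start = text.find(word, running_offset)
    if start == -1:
        # MeCab occasionally normalizes a surface form; skip what we cannot locate
        continue
    running_offset = start + len(word)
    tokens.append(Token(word, start))
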
Example #2
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #3
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        # there is a space or the end of the string after the punctuation,
        # because we do not want to replace 10.000 with 10 000
        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

        running_offset = 0
        tokens = []
        for word in words:
            # search from the running offset so repeated words keep distinct positions
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))

        return tokens
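
The regular expression above only strips punctuation that is followed by whitespace or the end of the string, so decimal and thousand separators survive. A quick illustration (the sample sentence is made up):

import re

text = "Order 10.000 units, please!"
words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()
# words == ['Order', '10.000', 'units', 'please']
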
Example #4
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # using empty string instead of real text string to make sure
    # count vector only can come from `tokens` feature.
    # using `message.text` can not get correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
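
The tokens and expected arguments of the test above are supplied by a parametrize decorator that is not part of this excerpt. A hedged sketch of what such a parametrization could look like; the token list and the count vector are illustrative, assuming scikit-learn's alphabetically ordered vocabulary, and are not taken from the original test suite:

import pytest

@pytest.mark.parametrize("tokens, expected", [
    # vocabulary ['hello', 'world'] -> "hello" appears twice, "world" once
    (["hello", "world", "hello"], [2, 1]),
])
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ...  # body as in the test above
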
Example #5
 def tokenize(self, text: str) -> List[Token]:
     '''Tokenize the sentence.
     '''
     if self.user_dict_dir is not None:
         self.load_user_dictionary(self.user_dict_dir)
     tokenized = jieba.tokenize(text)
     tokens = [Token(word, start) for (word, start, end) in tokenized]
     return tokens
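
The load_user_dictionary helper referenced above is not shown. jieba provides load_userdict() for exactly this purpose, so a minimal sketch, assuming the configured directory simply contains jieba dictionary files, could look like this:

 def load_user_dictionary(self, user_dict_dir):
     import glob
     import jieba
     # feed every dictionary file in the configured directory to jieba
     for path in glob.glob("{}/*".format(user_dict_dir)):
         jieba.load_userdict(path)
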
Example #6
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        tokenized = self.tokenizer.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
Example #7
 def process(self, message, **kwargs):
     tokens_s = []
     for token in message.get("tokens"):
         if token.text in self.slangs:
             for subtoken in self.slangs[token.text].split(" "):
                 # append each expanded sub-token (extend() would fail, a Token is not iterable)
                 tokens_s.append(Token(subtoken, 0))
         else:
             tokens_s.append(token)
     message.set("tokens_slangprocessed", tokens_s)
Example #8
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        self.dictionary = "mecabrc"
        self.tagger = MeCab.Tagger(self.dictionary)

        if not text:
            return []

        words = []
        # text is already a unicode string in Python 3, so it can be passed to
        # MeCab directly; the old Python 2 encode/str round-trip is not needed
        node = self.tagger.parseToNode(text)

        running_offset = 0
        word_offset = 0

        while node:
            features = node.feature.split(',')
            if features[self.INDEX_CATEGORY] in self.TARGET_CATEGORIES:
                if features[self.INDEX_ROOT_FORM] == "*":
                    word_offset = text.index(node.surface, running_offset)
                    word_len = len(node.surface)
                    running_offset = word_offset + word_len
                    words.append(Token(node.surface, word_offset))
                else:
                    try:
                        word_offset = text.index(
                            features[self.INDEX_ROOT_FORM], running_offset)
                        word_len = len(features[self.INDEX_ROOT_FORM])
                        running_offset = word_offset + word_len
                        words.append(
                            Token(features[self.INDEX_ROOT_FORM], word_offset))
                    except ValueError:
                        print("No such a string")
                        if not word_offset:
                            word_offset = 0
                        word_len = 1
                        running_offset = word_offset + word_len

            node = node.next
        # for eachword in words:
        #     print('Word ==> {}'.format(eachword))
        return words
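
The INDEX_CATEGORY, INDEX_ROOT_FORM and TARGET_CATEGORIES attributes are not part of this excerpt. With MeCab's ipadic feature layout the part-of-speech is the first field and the base form is the seventh, so a plausible set of class constants (illustrative, not taken from the original class) would be:

    # illustrative values assuming the ipadic feature CSV layout
    INDEX_CATEGORY = 0   # part-of-speech (品詞)
    INDEX_ROOT_FORM = 6  # base form (原形)
    TARGET_CATEGORIES = ["名詞", "動詞", "形容詞"]  # nouns, verbs, adjectives
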
Example #9
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        if self.dictionary_path is not None:
            self.load_custom_dictionary(self.dictionary_path)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens
Example #10
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        tokens = []
        start = 0
        for word in self.seg.segment(text):
            # offsets assume the segmenter returns contiguous slices of the text
            tokens.append(Token(word, start))
            start += len(word)

        return tokens
Example #11
    def tokenize_text(self, text):
        # type: (Text) -> List[Text]
        tokenized = self.tokenizer.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        token_text = [token.text for token in tokens]

        return token_text
Example #12
 def process(self, message, **kwargs):
     from textblob import Word
     token_spellchecked = []
     for token in message.get("tokens_slangprocessed"):
         # spell-correct each token; fall back to the original text if the
         # correction comes back empty
         corrected = str(Word(token.text).correct())
         if not corrected:
             corrected = token.text
         token_spellchecked.append(Token(corrected, 0))
     message.set("token_spellchecked", token_spellchecked)
Example #13
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import mitie

        _text = text.encode('utf-8')
        tokenized = mitie.tokenize_with_offsets(_text)
        tokens = [
            Token(token.decode('utf-8'),
                  self._byte_to_char_offset(_text, offset))
            for token, offset in tokenized
        ]
        return tokens
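
The _byte_to_char_offset helper is not shown above. Since mitie.tokenize_with_offsets reports byte offsets into the UTF-8 encoded text, the helper essentially has to count how many characters precede that byte position; a minimal sketch:

    @staticmethod
    def _byte_to_char_offset(text, byte_offset):
        # type: (bytes, int) -> int
        return len(text[:byte_offset].decode('utf-8'))
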
Example #14
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        words = text.split()
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
Example #15
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        import jieba

        # jieba accepts unicode text directly; encoding to UTF-8 bytes is Python 2 legacy
        words = jieba.lcut(text)
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
Example #16
 def tokenize(self, text):
     # type: (Text) -> List[Token]
     from pyhanlp import HanLP
     terms = HanLP.segment(text)
     running_offset = 0
     tokens = []
     for term in terms:
         word_offset = text.index(term.word, running_offset)
         word_len = len(term.word)
         running_offset = word_offset + word_len
         tokens.append(Token(term.word, word_offset))
     logging.debug(terms)
     return tokens
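
For reference, HanLP.segment returns Term objects that carry both the surface form and a part-of-speech tag, which is why term.word is used above. A small usage sketch (the sample sentence is arbitrary):

from pyhanlp import HanLP

for term in HanLP.segment("我爱自然语言处理"):
    # each Term exposes the surface form (.word) and a POS tag (.nature)
    print(term.word, term.nature)
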
Example #17
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        seg = self.Mycut(text)
        seg = self.add_userdict(seg)
        seg = self.split_userdict(seg)
        seg = seg.split('<>')

        tokens = []
        i = 0
        for w in seg:
            tokens.append(Token(w, i))
            i += len(w)
        return tokens
Example #18
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        # stop_words (a stopword set) and nlp (a loaded spaCy model) are
        # assumed to be defined at module level; earlier experiments with
        # spaCy/NLTK stopword removal have been dropped from this version.

        # there is space or end of string after punctuation
        # because we do not want to replace 10.000 with 10 000
        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

        # lowercase and drop stopwords
        tokensd = [
            tokend.lower() for tokend in words
            if tokend.lower() not in stop_words
        ]

        # lemmatize with spaCy, then strip punctuation and non-ASCII characters
        doc = nlp(' '.join(tokensd))
        words = [str(lemm.lemma_) for lemm in doc]
        words = [
            re.sub(
                r'[^\x00-\x7f]', '',
                re.sub(r'[\t\r\n,)([\]!%|!#$%&*+,.-/:;<=>?@^_`{|}~?]', '',
                       str(i))).strip() for i in words
        ]

        # compute offsets against the normalized text, not the original input
        running_offset = 0
        tokens = []
        texts = ' '.join(words)
        for word in words:
            word_offset = texts.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))

        return tokens
Example #19
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        # words=self.parse_with_cabocha(text)
        words = self.parse_with_knp(text)
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
            # print(word, word_offset)
        return tokens
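
The parse_with_knp helper (and the parse_with_cabocha alternative) is not part of this excerpt. A minimal sketch with the pyknp bindings, assuming Juman++/KNP are installed and that only the morpheme surface forms are needed, could look like this:

    def parse_with_knp(self, text):
        from pyknp import KNP
        knp = KNP()
        result = knp.parse(text)
        # midasi is the surface form of each morpheme
        return [mrph.midasi for mrph in result.mrph_list()]
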
Example #20
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        token_list = self.mecab.morphs(text)

        running_offset = 0
        result = []
        for token in token_list:
            token_offset = text.index(token, running_offset)
            token_len = len(token)
            running_offset = token_offset + token_len
            result.append(Token(token, token_offset))

        return result
Example #21
    def tokenize(self, text: Text) -> List[Token]:

        # there is space or end of string after punctuation
        # because we do not want to replace 10.000 with 10 000
        words = re.sub(r'[.,!?]+(\s|$)', ' ', text).split()

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
Example #22
 def tokenize(self, text):
     """
     Tokenize a sentence into single-character tokens.
     type: (Text) -> List[Token]
     Parameter:
         - text: the str (unicode) to be segmented.
     """
     tokens = []
     start = 0
     for char in text:
         tokens.append(Token(char, start))
         start += 1
     return tokens
Example #23
 def tokenize(self, text):
     words = []
     mecab_features = []
     node = self.mecab.parseToNode(text).next
     while node:
         words.append(node.surface)
         mecab_features.append(node.feature.split(','))
         node = node.next
     running_offset = 0
     tokens = []
     for word in words:
         word_offset = text.index(word, running_offset)
         running_offset = word_offset + len(word)
         tokens.append(Token(word, word_offset))
     return tokens, mecab_features
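
Unlike the other tokenizers, this one also returns the raw MeCab feature strings. A hedged usage sketch (tokenizer stands for an instance of the class above; with the ipadic layout the first feature field is the part-of-speech):

tokens, mecab_features = tokenizer.tokenize("今日はいい天気です")
nouns = [tok.text for tok, feat in zip(tokens, mecab_features)
         if feat[0] == "名詞"]  # keep only the nouns
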
Example #24
def tokenize_msg(text, msg_chunks):         
    words=[]
    for chunk in msg_chunks.chunks:        
        for token in chunk.tokens:
            # print(token, token.pos)
            words.append(token.surface)

    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))   
    return tokens
Example #25
    def tokenize(self, text):
        # type: (Text) -> List[Token]
        from underthesea import word_sent

        # Vietnamese pyvi (kept for reference):
        # from pyvi.pyvi import ViTokenizer
        # tokenizer = ViTokenizer()
        # words = tokenizer.tokenize(text)

        # Vietnamese underthesea
        words = word_sent(text)
        tokenized = [(word, text.find(word), text.find(word) + len(word))
                     for word in words]
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
Example #26
 def tokenize(self, text):
     """
     Tokenize a sentence into a list of Token objects.
     type: (Text) -> List[Token]
     Parameter:
         - text: the str (unicode) to be segmented.
     """
     tokens = []
     tokenized = self.tokenizer.segment(text)
     start = 0
     for term in tokenized:
         # each term looks like "word/pos"; keep only the surface form
         w = str(term).split('/')[0]
         width = len(w)
         tokens.append(Token(w, start))
         start += width
     return tokens
Example #27
    def tokenize(self, text):
        # type: (Text) -> List[Token]

        words = text.split()
        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            # map number words to their numeric form before emitting the token
            if word in self.numbermap:
                word = self.numbermap[word]
            tokens.append(Token(word, word_offset))
        return tokens
Example #28
    def tokenize(self, doc: 'Doc') -> typing.List[Token]:

        return [Token(t.text, t.idx) for t in doc]
Example #29
logging.basicConfig(level="DEBUG")


@pytest.fixture(scope="session")
def duckling_interpreter(component_builder, tmpdir_factory):
    conf = RasaNLUModelConfig({"pipeline": [{"name": "ner_duckling_http"}]})
    return utilities.interpreter_for(
        component_builder,
        data="./data/examples/rasa/demo-rasa.json",
        path=tmpdir_factory.mktemp("projects").strpath,
        config=conf)


# Chinese Example
# "对面食过敏" -> To be allergic to wheat-based food
CH_wrong_segmentation = [Token("对面", 0),
                         Token("食", 2),
                         Token("过敏", 3)]  # opposite, food, allergy
CH_correct_segmentation = [Token("对", 0),
                           Token("面食", 1),
                           Token("过敏",
                                 3)]  # towards, wheat-based food, allergy
CH_wrong_entity = {"start": 0, "end": 2, "value": "对面", "entity": "direction"}
CH_correct_entity = {
    "start": 1,
    "end": 3,
    "value": "面食",
    "entity": "food_type"
}

# EN example
Example #30
    def tokenize(self, doc):
        # type: (Doc) -> List[Token]

        return [Token(t.text, t.idx) for t in doc]
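
Examples #28 and #30 expect an already parsed spaCy Doc rather than a raw string, because t.idx already carries the character offset of every spaCy token. A short usage sketch (the model name is only an example):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("hello New York")
tokens = tokenizer.tokenize(doc)  # tokenizer: an instance of the class above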