Example 1: a span-merging extract() method and SimpleTokenizer tokenizer tests.
    # Note: the enclosing module needs `import re` and
    # `from typing import Dict, List, Match`; ExtractResult, MatchesVal,
    # SimpleTokenizer, and Constants come from the surrounding package
    # and are not shown in this excerpt.
    def extract(self, source: str) -> List[ExtractResult]:
        result: List[ExtractResult] = []
        if not self._pre_check_str(source):
            return result

        # One flag per character; True where some regex match covers it.
        matched: List[bool] = [False] * len(source)

        match_source: Dict[Match, str] = {}
        # Run every regex against the source, keeping only patterns that hit.
        matches_list = [
            MatchesVal(matches=list(re.finditer(x.re, source)), val=x.val)
            for x in self.regexes
        ]
        matches_list = [ml for ml in matches_list if ml.matches]

        for ml in matches_list:
            for m in ml.matches:
                # Flag every character this match covers.
                for j in range(len(m.group())):
                    matched[m.start() + j] = True
                # Keep the source tag for extra information.
                match_source[m] = ml.val

        last = -1

        # Walk the flag array; each maximal run of flagged characters is a
        # candidate span, which merges overlapping matches into one result.
        simple_tokenizer = SimpleTokenizer()
        for i in range(len(source)):
            if not matched[i]:
                last = i
            else:
                if i + 1 == len(source) or not matched[i + 1]:
                    start = last + 1
                    length = i - last
                    substring = source[start:start + length].strip()
                    # Skip a span that abuts an IPv6 ellipsis ("::") when the
                    # neighbouring character is a digit or a non-CJK letter,
                    # since the span is then glued to surrounding text.
                    if substring.startswith(Constants.IPV6_ELLIPSIS) and (
                            start > 0 and
                            (str.isdigit(source[start - 1]) or
                             (str.isalpha(source[start - 1]) and
                              not simple_tokenizer.is_cjk(c=source[start - 1])))):
                        continue
                    elif substring.endswith(Constants.IPV6_ELLIPSIS) and (
                            i + 1 < len(source) and
                            (str.isdigit(source[i + 1]) or
                             (str.isalpha(source[i + 1]) and
                              not simple_tokenizer.is_cjk(c=source[i + 1])))):
                        continue

                    # Find the regex match that exactly covers this span.
                    src_match = next(
                        (x for x in match_source
                         if x.start() == start and x.end() - x.start() == length),
                        None)

                    if src_match is not None:
                        value = ExtractResult()
                        value.start = start
                        value.length = length
                        value.text = substring
                        value.type = self._extract_type
                        value.data = match_source.get(src_match, None)
                        result.append(value)

        return result
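
The core of extract() is the boolean coverage mask: every character covered by any regex match is flagged, and each maximal run of flagged characters becomes one candidate span, so overlapping matches from different patterns merge into a single result. Below is a standalone sketch of just that stage; the function name and patterns are illustrative, not part of the original code.

import re
from typing import List, Tuple

def merge_match_spans(source: str,
                      patterns: List[str]) -> List[Tuple[int, int, str]]:
    """Flag every character covered by any pattern, then emit maximal
    flagged runs as (start, length, text) spans, as extract() does."""
    matched = [False] * len(source)
    for pattern in patterns:
        for m in re.finditer(pattern, source):
            for j in range(m.start(), m.end()):
                matched[j] = True

    spans: List[Tuple[int, int, str]] = []
    last = -1
    for i in range(len(source)):
        if not matched[i]:
            last = i
        elif i + 1 == len(source) or not matched[i + 1]:
            start, length = last + 1, i - last
            spans.append((start, length, source[start:start + length]))
    return spans

# The overlapping matches '12 34' and '34 56' merge into one span '12 34 56'.
print(merge_match_spans('a 12 34 56 b', [r'12 34', r'34 56']))
# [(2, 8, '12 34 56')]

The pytest-style tests below exercise SimpleTokenizer directly.
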
def test_mixed_tokenized():
    tokenizer = SimpleTokenizer()
    text = 'Hello,请给我1杯beer谢谢!'
    tokenized_text = tokenizer.tokenize(text)

    assert 'Hello' == tokenized_text[0].text
    assert 11 == len(tokenized_text)


def test_english_tokenized():
    tokenizer = SimpleTokenizer()
    text = '   Hi, could     you give me a beer, please?'
    tokenized_text = tokenizer.tokenize(text)

    assert 'Hi' == tokenized_text[0].text
    assert 11 == len(tokenized_text)


def test_chinese_tokenized():
    tokenizer = SimpleTokenizer()
    text = '你好,请给我一杯啤酒!'
    tokenized_text = tokenizer.tokenize(text)

    assert '你' == tokenized_text[0].text
    assert 11 == len(tokenized_text)
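
All three inputs tokenize to exactly 11 tokens because SimpleTokenizer splits CJK text one character per token while keeping Latin words whole and dropping whitespace. A quick check, assuming SimpleTokenizer is importable from the module under test and that tokens expose the .text attribute the tests rely on:

tokenizer = SimpleTokenizer()
print([t.text for t in tokenizer.tokenize('Hello,请给我1杯beer谢谢!')])
# 11 tokens, consistent with test_mixed_tokenized above:
# ['Hello', ',', '请', '给', '我', '1', '杯', 'beer', '谢', '谢', '!']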