Example 1: a span-merging extract() method and SimpleTokenizer tokenizer tests.
    # Note: the enclosing module needs `import re` and
    # `from typing import Dict, List, Match`; ExtractResult, MatchesVal,
    # SimpleTokenizer, and Constants come from the surrounding package
    # and are not shown in this excerpt.
    def extract(self, source: str) -> List[ExtractResult]:
        result: List[ExtractResult] = []
        if not self._pre_check_str(source):
            return result

        # One flag per character; True where some regex match covers it.
        matched: List[bool] = [False] * len(source)

        match_source: Dict[Match, str] = {}
        # Run every regex against the source, keeping only patterns that hit.
        matches_list = [
            MatchesVal(matches=list(re.finditer(x.re, source)), val=x.val)
            for x in self.regexes
        ]
        matches_list = [ml for ml in matches_list if ml.matches]

        for ml in matches_list:
            for m in ml.matches:
                # Flag every character this match covers.
                for j in range(len(m.group())):
                    matched[m.start() + j] = True
                # Keep the source tag for extra information.
                match_source[m] = ml.val

        last = -1

        # Walk the flag array; each maximal run of flagged characters is a
        # candidate span, which merges overlapping matches into one result.
        simple_tokenizer = SimpleTokenizer()
        for i in range(len(source)):
            if not matched[i]:
                last = i
            else:
                if i + 1 == len(source) or not matched[i + 1]:
                    start = last + 1
                    length = i - last
                    substring = source[start:start + length].strip()
                    # Skip a span that abuts an IPv6 ellipsis ("::") when the
                    # neighbouring character is a digit or a non-CJK letter,
                    # since the span is then glued to surrounding text.
                    if substring.startswith(Constants.IPV6_ELLIPSIS) and (
                            start > 0 and
                            (str.isdigit(source[start - 1]) or
                             (str.isalpha(source[start - 1]) and
                              not simple_tokenizer.is_cjk(c=source[start - 1])))):
                        continue
                    elif substring.endswith(Constants.IPV6_ELLIPSIS) and (
                            i + 1 < len(source) and
                            (str.isdigit(source[i + 1]) or
                             (str.isalpha(source[i + 1]) and
                              not simple_tokenizer.is_cjk(c=source[i + 1])))):
                        continue

                    # Find the regex match that exactly covers this span.
                    src_match = next(
                        (x for x in match_source
                         if x.start() == start and x.end() - x.start() == length),
                        None)

                    if src_match is not None:
                        value = ExtractResult()
                        value.start = start
                        value.length = length
                        value.text = substring
                        value.type = self._extract_type
                        value.data = match_source.get(src_match, None)
                        result.append(value)

        return result
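
The core of extract() is the boolean coverage mask: every character covered by any regex match is flagged, and each maximal run of flagged characters becomes one candidate span, so overlapping matches from different patterns merge into a single result. Below is a standalone sketch of just that stage; the function name and patterns are illustrative, not part of the original code.

import re
from typing import List, Tuple

def merge_match_spans(source: str,
                      patterns: List[str]) -> List[Tuple[int, int, str]]:
    """Flag every character covered by any pattern, then emit maximal
    flagged runs as (start, length, text) spans, as extract() does."""
    matched = [False] * len(source)
    for pattern in patterns:
        for m in re.finditer(pattern, source):
            for j in range(m.start(), m.end()):
                matched[j] = True

    spans: List[Tuple[int, int, str]] = []
    last = -1
    for i in range(len(source)):
        if not matched[i]:
            last = i
        elif i + 1 == len(source) or not matched[i + 1]:
            start, length = last + 1, i - last
            spans.append((start, length, source[start:start + length]))
    return spans

# The overlapping matches '12 34' and '34 56' merge into one span '12 34 56'.
print(merge_match_spans('a 12 34 56 b', [r'12 34', r'34 56']))
# [(2, 8, '12 34 56')]

The pytest-style tests below exercise SimpleTokenizer directly.
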
def test_mixed_tokenized():
    tokenizer = SimpleTokenizer()
    text = 'Hello,请给我1杯beer谢谢!'
    tokenized_text = tokenizer.tokenize(text)

    assert 'Hello' == tokenized_text[0].text
    assert 11 == len(tokenized_text)


def test_english_tokenized():
    tokenizer = SimpleTokenizer()
    text = '   Hi, could     you give me a beer, please?'
    tokenized_text = tokenizer.tokenize(text)

    assert 'Hi' == tokenized_text[0].text
    assert 11 == len(tokenized_text)


def test_chinese_tokenized():
    tokenizer = SimpleTokenizer()
    text = '你好,请给我一杯啤酒!'
    tokenized_text = tokenizer.tokenize(text)

    assert '你' == tokenized_text[0].text
    assert 11 == len(tokenized_text)
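
All three inputs tokenize to exactly 11 tokens because SimpleTokenizer splits CJK text one character per token while keeping Latin words whole and dropping whitespace. A quick check, assuming SimpleTokenizer is importable from the module under test and that tokens expose the .text attribute the tests rely on:

tokenizer = SimpleTokenizer()
print([t.text for t in tokenizer.tokenize('Hello,请给我1杯beer谢谢!')])
# 11 tokens, consistent with test_mixed_tokenized above:
# ['Hello', ',', '请', '给', '我', '1', '杯', 'beer', '谢', '谢', '!']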