Example no. 1
0
    def match_value(self, source: List[str], match: List[str], start_pos: int) -> float:
        """Score how well the `match` tokens appear, in order, within `source`.

        Each match token is looked up from the current position; a hit only
        counts when its distance from the previous hit is within
        `self.config.max_distance`.  Returns 0.0 when nothing usable matched
        (or the match is partial and partial matches are disallowed),
        otherwise a score in (0.4, 1.0] that grows with completeness and
        positional accuracy.
        """
        hits = 0
        deviation_sum = 0
        for token in match:
            found_at = StringUtility.index_of(source, token, start_pos)
            if found_at < 0:
                continue
            # The first hit carries no positional penalty.
            gap = found_at - start_pos if hits > 0 else 0
            if gap <= self.config.max_distance:
                hits += 1
                deviation_sum += gap
                start_pos = found_at + 1

        if hits == 0:
            return 0.0
        if hits != len(match) and not self.config.allow_partial_match:
            return 0.0

        completeness = hits / len(match)
        accuracy = completeness * (hits / (hits + deviation_sum))
        base_score = accuracy * (hits / len(source))
        return 0.4 + 0.6 * base_score
Example no. 2
0
    def __tokenize(self, source: str) -> List[str]:
        """Split `source` into a list of tokens.

        Emoji characters are always emitted as single-character tokens (and
        flush any token accumulated so far).  Characters matching
        `self.config.token_regex` and whitespace act as separators; every
        other character is appended to the current token.
        """
        tokens: List[str] = []
        token: str = ''
        # Compile once; the pattern is applied per character below.
        pattern = regex.compile(self.config.token_regex)

        # BUG FIX: the original built `chars = slice(source)` — `slice()`
        # returns a non-iterable slice object, so the loop raised TypeError.
        # Iterate the string directly instead.
        for char in source:
            if StringUtility.is_emoji(char):
                # Emit the emoji itself, then flush any pending token.
                tokens.append(char)
                if not (token is None or token.strip() == ''):
                    tokens.append(token)
                    token = ''
            # BUG FIX: the whitespace test must inspect the current
            # character (`char.strip()`), not the whole sequence.
            elif not (pattern.search(char) is not None or char.strip() == ''):
                # Ordinary character: extend the current token.
                token = token + char
            elif token != '' or token.strip() != '':
                # Separator: flush the pending token, if any.
                tokens.append(token)
                token = ''

        # Flush the trailing token.
        if token != '' or token.strip() != '':
            tokens.append(token)

        return tokens
 def __get_matches(regexp: Pattern, source: str) -> List[str]:
     """Return the lower-cased text of every match of `regexp` in `source`.

     Empty matches are filtered out.  The pattern is first rewritten by
     `StringUtility.remove_unicode_matches` so the `regex` module accepts it.

     FIX: the return annotation was the list literal `[]` — syntactically
     legal but meaningless as a type; it is now `List[str]`.
     """
     py_regex = StringUtility.remove_unicode_matches(regexp)
     # No need to materialize an intermediate list before filtering.
     lowered = (m.group().lower() for m in regex.finditer(py_regex, source))
     return [text for text in lowered if text]