def match_value(self, source: List[str], match: List[str], start_pos: int) -> float:
    """Score how well the token sequence *match* occurs, in order, inside
    *source* starting at *start_pos*.

    Returns 0.0 when nothing acceptable matched (or a partial match is not
    allowed by the config); otherwise a score in (0.4, 1.0] that grows with
    completeness, closeness of the hits, and coverage of *source*.
    """
    hit_count = 0
    deviation_sum = 0
    cursor = start_pos
    for token in match:
        found_at = StringUtility.index_of(source, token, cursor)
        if found_at < 0:
            continue
        # The first hit carries no distance penalty; later hits are
        # penalized by how far they sit from the current cursor.
        gap = found_at - cursor if hit_count else 0
        if gap > self.config.max_distance:
            continue
        hit_count += 1
        deviation_sum += gap
        cursor = found_at + 1

    fully_matched = hit_count == len(match)
    if hit_count == 0 or not (fully_matched or self.config.allow_partial_match):
        return 0.0

    completeness = hit_count / len(match)
    accuracy = completeness * (hit_count / (hit_count + deviation_sum))
    initial_score = accuracy * (hit_count / len(source))
    # Map the raw score into the (0.4, 1.0] band.
    return 0.4 + 0.6 * initial_score
def __tokenize(self, source: str) -> List[str]:
    """Split *source* into tokens.

    Emoji characters are emitted as single-character tokens (flushing any
    pending token right after them); characters that are neither whitespace
    nor matched by ``config.token_regex`` accumulate into a pending token;
    any separator (whitespace or a ``token_regex`` hit) flushes the pending
    token.
    """
    tokens = []
    token: str = ''
    pattern = regex.compile(self.config.token_regex)
    # BUG FIX: the original iterated ``slice(source)`` — ``slice()`` builds
    # a non-iterable slice object, so the loop raised TypeError. Iterate the
    # string's characters directly instead.
    for char in source:
        if StringUtility.is_emoji(char):
            # An emoji is a stand-alone token; note it is appended *before*
            # the pending token, preserving the original emission order.
            tokens.append(char)
            if not (token is None or token.strip() == ''):
                tokens.append(token)
                token = ''
        # BUG FIX: the separator test used ``chars.strip()`` (the whole
        # sequence) where the per-character ``char.strip()`` was intended.
        elif not (pattern.search(char) is not None or char.strip() == ''):
            token = token + char
        elif token != '' or token.strip() != '':
            tokens.append(token)
            token = ''

    # Flush any trailing pending token.
    if token != '' or token.strip() != '':
        tokens.append(token)
        token = ''

    return tokens
def __get_matches(regexp: Pattern, source: str) -> List[str]:
    """Run *regexp* (converted to a Python-compatible pattern) over *source*
    and return the non-empty match texts, lower-cased.

    BUG FIX: the return annotation was the literal ``[]`` — an empty list
    object, not a type — and is now ``List[str]``.
    """
    py_regex = StringUtility.remove_unicode_matches(regexp)
    # Lower-case each match text and drop empty matches (equivalent to the
    # previous ``filter(None, map(...))`` pipeline).
    return [m.group().lower() for m in regex.finditer(py_regex, source) if m.group()]