def test_multiple_processor_runs(sentence):
    """
    Test that running a preprocessor on a sentence
    a second time does not change the result
    """
    assert utils.default_process(sentence) \
        == utils.default_process(utils.default_process(sentence))
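The property under test can be shown directly; a minimal sketch, assuming rapidfuzz's utils module (the input string is illustrative):

# Minimal sketch of the idempotence being tested (illustrative input):
from rapidfuzz import utils

s = "  C'est la VIE!  "
once = utils.default_process(s)      # lowercased, punctuation replaced, trimmed
twice = utils.default_process(once)
assert once == twice                 # a second pass changes nothing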
Example #2
def partial_token_set_ratio(s1: str,
                            s2: str,
                            processor: Union[bool, Callable] = True,
                            score_cutoff: float = 0) -> float:
    """
    Compares the words in the strings based on unique and common words between them using fuzz.partial_ratio

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings. utils.default_process
            is used by default, which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """

    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    return rapidfuzz._fuzz.partial_token_set_ratio(s1,
                                                   s2,
                                                   score_cutoff=score_cutoff)
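A brief usage sketch (inputs are hypothetical; exact scores depend on the rapidfuzz version):

# Hypothetical usage; token-set matching ignores word order and duplicates.
from rapidfuzz import fuzz

score = fuzz.partial_token_set_ratio("fuzzy was a bear", "a bear, fuzzy was he")
assert 0 <= score <= 100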
Example #3
def quick_lev_ratio(s1: str,
                    s2: str,
                    processor: Union[bool, Callable] = True,
                    score_cutoff: float = 0) -> float:
    """
    Calculates a quick estimate of fuzz.ratio by counting uncommon letters between the two strings.
    Guaranteed to be equal to or higher than fuzz.ratio, so it can be used to filter candidates before running fuzz.ratio

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings. utils.default_process
            is used by default, which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    """

    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    return rapidfuzz._fuzz.quick_lev_ratio(s1, s2, score_cutoff=score_cutoff)
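Because the estimate never undershoots fuzz.ratio, it works as a cheap pre-filter; a sketch under that assumption (candidates and cutoff are hypothetical):

# Hypothetical pre-filter: skip the slower fuzz.ratio for candidates whose
# upper-bound estimate already falls below the cutoff.
from rapidfuzz import fuzz

query = "apple pie"
candidates = ["apple pies", "apple tart", "banana bread"]
cutoff = 80

survivors = [c for c in candidates if quick_lev_ratio(query, c) >= cutoff]
exact = {c: fuzz.ratio(query, c, score_cutoff=cutoff) for c in survivors}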
Example #4
def partial_token_ratio(s1: str,
                        s2: str,
                        processor: Union[bool, Callable] = True,
                        score_cutoff: float = 0) -> float:
    """
    Helper method that returns the maximum of fuzz.partial_token_set_ratio and fuzz.partial_token_sort_ratio
        (faster than manually executing the two functions)

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings. utils.default_process
            is used by default, which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """

    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    return rapidfuzz._fuzz.partial_token_ratio(s1,
                                               s2,
                                               score_cutoff=score_cutoff)
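Usage mirrors the two functions it wraps; a hedged sketch (inputs are illustrative):

# Hypothetical usage; equivalent to taking the max of partial_token_set_ratio
# and partial_token_sort_ratio, but computed in one pass.
from rapidfuzz import fuzz

score = fuzz.partial_token_ratio("new york mets", "the new york mets")
assert 0 <= score <= 100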
Example #5
def WRatio(s1: str,
           s2: str,
           processor: Union[bool, Callable] = True,
           score_cutoff: float = 0) -> float:
    """
    Calculates a weighted ratio based on the other ratio algorithms

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings. utils.default_process
            is used by default, which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    """

    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    return rapidfuzz._fuzz.WRatio(s1, s2, score_cutoff=score_cutoff)
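WRatio is a reasonable default when the best scorer is not known up front; a brief sketch (inputs are hypothetical):

# Hypothetical usage; WRatio weights several of the ratio variants above.
from rapidfuzz import fuzz

score = fuzz.WRatio("this is a test", "this is a new test!!")
assert 0 <= score <= 100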
Example #6
def partial_ratio(s1: str,
                  s2: str,
                  processor: Union[bool, Callable] = False,
                  score_cutoff: float = 0) -> float:
    """
    Calculates a partial ratio between two strings

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            No preprocessing is applied by default; pass True to use utils.default_process,
            which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.partial_ratio("this is a test", "this is a test!")
        100.0
    """

    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    return rapidfuzz._fuzz.partial_ratio(s1, s2, score_cutoff=score_cutoff)
Example #7
def token_sort_ratio(s1: str,
                     s2: str,
                     processor: Union[bool, Callable] = True,
                     score_cutoff: float = 0) -> float:
    """
    Sorts the words in the strings and calculates the fuzz.ratio between them

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings. utils.default_process
            is used by default, which lowercases the strings and trims whitespace
        score_cutoff (float): Optional argument for a score threshold as a float between 0 and 100.
            For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
        100.0
    """

    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    return rapidfuzz._fuzz.token_sort_ratio(s1, s2, score_cutoff=score_cutoff)
import os
from shutil import copymode, move
from tempfile import mkstemp
from urllib.parse import unquote

# NOTE: BASEURL is assumed to be a module-level constant defined elsewhere.


def formatdb(db_file, output_file):
    # TODO: remove broken urls and data from the file; requires some work
    # TODO: make the json valid; there are extra commas at the end of the file
    from rapidfuzz import utils
    fh, abs_path = mkstemp()
    counter = 1
    with os.fdopen(fh, 'wb') as new_file:
        with open(db_file) as old_file:
            # use readline() to read the first line
            line = old_file.readline()
            while line:
                #skip status
                if line.startswith('"success":'):
                    line = old_file.readline()
                    continue
                if line[-3:-1] == ' {':
                    #name
                    string = line[1:-5]
                    #MAKE SURE DELIMITER IS STRIPPED (from searchModified.cpp):
                    newstring = utils.default_process(string)
                    oldline = line
                    line = old_file.readline()
                    if len(newstring) == 0:  # fall back to the text after the last '/' in the url
                        newstring = line.rstrip()[8:-3]
                        newstring = unquote(newstring[newstring.rfind('/') +
                                                      1:])
                        if utils.default_process(newstring).strip() == "":
                            newstring = "NONE" + str(counter)
                            counter += 1
                    new_file.write(oldline.replace(string, newstring).encode())
                    if BASEURL in line:
                        if line.startswith('\t'):
                            # url - 1
                            url = line[9:-3]
                            starting = line[:9]
                            #skip file type, already in link
                            old_file.readline()
                        else:
                            url = line[8:-3]
                            starting = line[:8]
                            #skip status
                        # new_url = unquote(url).replace(BASEURL, '') #removed because unsafe chars might interfere with delimiter(see searchModified.cpp)
                        new_url = url.replace(BASEURL, '')
                        new_string = starting.encode() + new_url.encode(
                        ) + line[-3:].encode()
                        new_file.write(new_string)
                        line = old_file.readline()
                        continue
                new_file.write(line.encode())
                line = old_file.readline()
    #Copy the file permissions from the old file to the new file
    copymode(db_file, abs_path)
    #Move new file
    move(abs_path, output_file)
def recognize(
    input_text: str,
    intent_graph: nx.DiGraph,
    examples_path: str,
    confidence: float = 70,
    intent_filter: typing.Optional[typing.Callable[[str], bool]] = None,
    extra_converters: typing.Optional[typing.Dict[str, typing.Callable[
        ..., typing.Any]]] = None,
) -> typing.List[Recognition]:
    """Find the closest matching intent(s). Default 	confidence 70				"""
    start_time = time.perf_counter()
    intent_filter = intent_filter or (lambda i: True)

    # Find closest match
    # pylint: disable=unpacking-non-sequence
    best_text, best_path, best_score, name_of_intent = extract_one_sqlite(
        fuzz_utils.default_process(input_text), examples_path)
    _LOGGER.debug("input=%s, match=%s, score=%s", input_text, best_text,
                  best_score)

    end_time = time.perf_counter()
    _, recognition = rhasspynlu.fsticuffs.path_to_recognition(
        best_path, intent_graph, extra_converters=extra_converters)

    if best_score >= confidence:
        # assert recognition and recognition.intent, "Failed to find a match"
        recognition.intent.name = name_of_intent
        recognition.intent.confidence = best_score / 100.0
        recognition.recognize_seconds = end_time - start_time
        recognition.raw_text = input_text
        recognition.raw_tokens = input_text.split()
        return [recognition]

    # No confident match: return an empty list to match the annotated return type
    return []
Example #10
    async def command_not_found(self, string: str) -> "HelpQueryNotFound":
        """
        Handles when a query does not match a valid command, group, cog or category.

        Will return an instance of the `HelpQueryNotFound` exception with the error message and possible matches.
        """
        choices = list(await self.get_all_help_choices())
        result = process.extract(default_process(string), choices, scorer=fuzz.ratio, score_cutoff=60, processor=None)
        return HelpQueryNotFound(f'Query "{string}" not found.', {choice[0]: choice[1] for choice in result})
def normalize_string(s: str) -> str:
    """Receives a string and apply transformations to normalize it making
    fuzzy match easier
    >>> normalize_string("São Paulo - Capital, SP")
    "sao paulo capital sp"
    """
    s = default_process(s)
    normalized = (
        unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode()
    )
    return " ".join(normalized.split())
Example #12
def train(intent_graph: nx.DiGraph) -> ExamplesType:
    """Generate examples from intent graph."""

    # Generate all possible intents
    _LOGGER.debug("Generating examples")
    examples: ExamplesType = defaultdict(dict)
    for intent_name, words, path in generate_examples(intent_graph):
        sentence = fuzz_utils.default_process(" ".join(words))
        examples[intent_name][sentence] = path

    _LOGGER.debug("Examples generated")

    return examples
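The returned mapping is keyed by intent name and then by the preprocessed sentence; a sketch of the shape with hypothetical intents and graph paths:

# Hypothetical shape of the examples mapping produced by train():
examples = {
    "GetTime": {"what time is it": [0, 1, 2, 3]},
    "ChangeLight": {"turn on the light": [0, 4, 5, 6]},
}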
Example #13
    def test_fullProcess(self):
        mixed_strings = [
            "Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
            "C'est la vie", u"Ça va?", u"Cães danados", u"¬Camarões assados",
            u"a¬ሴ€耀", u"Á"
        ]
        mixed_strings_proc = [
            "lorem ipsum is simply dummy text of the printing and typesetting industry",
            "c est la vie", u"ça va", u"cães danados", u"camarões assados",
            u"a ሴ 耀", u"á"
        ]

        for string, proc_string in zip(mixed_strings, mixed_strings_proc):
            self.assertEqual(utils.default_process(string), proc_string)
Example #14
def process_fuzzy(merged_df, alias):
    """
    Uses Fuzzy matching to check if any of the unmatched entities
    can be matched using either wiki-links or fuzzy matching.
    """

    count = 0
    alias['name'] = alias['name'].apply(str)
    processed_orgs = {
        row['parent']: utils.default_process(row['name'])
        for _, row in alias.iterrows()
    }

    all_wiki_links = alias.wikipedia.unique()

    # Fetch all the entities that haven't been matched
    for index, entity in merged_df[
            merged_df["entity_ref_id"].isnull()].iterrows():

        if count % 100 == 0:
            logging.info(count)
        count += 1

        wikilink = entity.wiki

        # match the wikilink against known wikilinks; if it has an alias, use that match (O(1))
        if wikilink and wikilink in all_wiki_links:
            match = alias[alias['wikipedia'] == wikilink]['parent'].iloc[0]
            merged_df.loc[index, 'entity_ref_id'] = match
            merged_df.loc[index, 'score'] = -1
            continue

        # try matching processed entity.name with all of the entities
        best_match = process.extractOne(entity.text,
                                        processed_orgs,
                                        processor=process_entity,
                                        scorer=fuzz.token_set_ratio,
                                        score_cutoff=70)

        # if match found and is of the same type
        if best_match and \
                entity.label == alias[alias["parent"] == best_match[0]].iloc[0]["type"]:
            merged_df.loc[index, 'entity_ref_id'] = best_match[0]
            merged_df.loc[index, 'score'] = best_match[1]
        else:
            merged_df.loc[index, 'score'] = -2
    return merged_df
Example #15
def recognize(
    input_text: str,
    intent_graph: nx.DiGraph,
    examples: ExamplesType,
    intent_filter: typing.Optional[typing.Callable[[str], bool]] = None,
    extra_converters: typing.Optional[typing.Dict[str, typing.Callable[
        ..., typing.Any]]] = None,
) -> typing.List[Recognition]:
    """Find the closest matching intent(s)."""
    start_time = time.perf_counter()
    intent_filter = intent_filter or (lambda i: True)
    choices: typing.Dict[str, typing.List[int]] = {
        text: path
        for intent_name, paths in examples.items()
        if intent_filter(intent_name) for text, path in paths.items()
    }

    # Find closest match
    # pylint: disable=unpacking-non-sequence
    best_text, best_score = fuzzy_process.extractOne(
        fuzz_utils.default_process(input_text), choices.keys(), processor=None)
    _LOGGER.debug("input=%s, match=%s, score=%s", input_text, best_text,
                  best_score)
    best_path = choices[best_text]

    end_time = time.perf_counter()
    _, recognition = rhasspynlu.fsticuffs.path_to_recognition(
        best_path, intent_graph, extra_converters=extra_converters)

    assert recognition and recognition.intent, "Failed to find a match"
    recognition.intent.confidence = best_score / 100.0
    recognition.recognize_seconds = end_time - start_time
    recognition.raw_text = input_text
    recognition.raw_tokens = input_text.split()

    return [recognition]
Example #16
def processQuery(query: str) -> str:
    return default_process(" ".join(query.split())).replace('  ', ' ')
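A hedged illustration of the combined normalization (the input is hypothetical):

# Hypothetical usage: whitespace is collapsed, text lowercased, punctuation stripped.
processQuery("  What   TIME  is it? ")  # -> "what time is it"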
Example #17
@pytest.mark.parametrize("scorer", scorers)
def test_simple_unicode_tests(scorer):
    """
    some very simple tests using unicode with scorers
    to catch relatively obvious implementation errors
    """
    s1 = u"ÁÄ"
    s2 = "ABCD"
    assert scorer(s1, s2) == 0
    assert scorer(s1, s1) == 100


@pytest.mark.parametrize(
    "processor",
    [True, utils.default_process, lambda s: utils.default_process(s)])
@pytest.mark.parametrize("scorer", scorers)
def test_scorer_case_insensitive(processor, scorer):
    """
    each scorer should be able to preprocess strings properly
    """
    assert scorer(RatioTest.s1, RatioTest.s2, processor=processor) == 100


@pytest.mark.parametrize("processor", [False, None, lambda s: s])
def test_ratio_case_sensitive(processor):
    assert fuzz.ratio(RatioTest.s1, RatioTest.s2, processor=processor) != 100


@pytest.mark.parametrize("scorer", scorers)
def test_custom_processor(scorer):