def test_multiple_processor_runs(sentence):
    """
    Check that the default preprocessor is idempotent: feeding its own
    output back in must not change the result any further.
    """
    processed_once = utils.default_process(sentence)
    processed_twice = utils.default_process(utils.default_process(sentence))
    assert processed_once == processed_twice
def partial_token_set_ratio(s1: str, s2: str, processor: Union[bool, Callable] = True,
                            score_cutoff: float = 0) -> float:
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.partial_ratio

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            When True (the default) utils.default_process is used, which lowercases
            the strings and trims whitespace. Any falsy, non-callable value disables
            preprocessing.
        score_cutoff (float): Optional argument for a score threshold as a float between
            0 and 100. For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    if callable(processor):
        s1 = processor(s1)
        s2 = processor(s2)
    elif processor:
        s1 = utils.default_process(s1)
        s2 = utils.default_process(s2)

    # Both operands have to reach the backend; forwarding s1 twice would
    # ignore s2 entirely and only ever score s1 against itself.
    return rapidfuzz._fuzz.partial_token_set_ratio(s1, s2, score_cutoff=score_cutoff)
def quick_lev_ratio(s1: str, s2: str, processor: Union[bool, Callable] = True,
                    score_cutoff: float = 0) -> float:
    """
    Cheap estimate of fuzz.ratio obtained by counting the letters the two
    sentences do not have in common. The estimate is guaranteed to be equal to
    or higher than fuzz.ratio, so it can be used to pre-filter candidates
    before running fuzz.ratio itself.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            When True (the default) utils.default_process is used, which lowercases
            the strings and trims whitespace. Any falsy, non-callable value disables
            preprocessing.
        score_cutoff (float): Optional argument for a score threshold as a float between
            0 and 100. For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    # Resolve which preprocessing function (if any) applies to both inputs.
    if callable(processor):
        preprocess = processor
    elif processor:
        preprocess = utils.default_process
    else:
        preprocess = None

    if preprocess is not None:
        s1 = preprocess(s1)
        s2 = preprocess(s2)

    return rapidfuzz._fuzz.quick_lev_ratio(s1, s2, score_cutoff=score_cutoff)
def partial_token_ratio(s1: str, s2: str, processor: Union[bool, Callable] = True,
                        score_cutoff: float = 0) -> float:
    """
    Helper that yields the maximum of fuzz.partial_token_set_ratio and
    fuzz.partial_token_sort_ratio (faster than running the two functions
    separately).

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            When True (the default) utils.default_process is used, which lowercases
            the strings and trims whitespace. Any falsy, non-callable value disables
            preprocessing.
        score_cutoff (float): Optional argument for a score threshold as a float between
            0 and 100. For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    # Resolve which preprocessing function (if any) applies to both inputs.
    if callable(processor):
        preprocess = processor
    elif processor:
        preprocess = utils.default_process
    else:
        preprocess = None

    if preprocess is not None:
        s1 = preprocess(s1)
        s2 = preprocess(s2)

    return rapidfuzz._fuzz.partial_token_ratio(s1, s2, score_cutoff=score_cutoff)
def WRatio(s1: str, s2: str, processor: Union[bool, Callable] = True,
           score_cutoff: float = 0) -> float:
    """
    Weighted ratio that is built on top of the other ratio algorithms.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            When True (the default) utils.default_process is used, which lowercases
            the strings and trims whitespace. Any falsy, non-callable value disables
            preprocessing.
        score_cutoff (float): Optional argument for a score threshold as a float between
            0 and 100. For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100
    """
    # Resolve which preprocessing function (if any) applies to both inputs.
    if callable(processor):
        preprocess = processor
    elif processor:
        preprocess = utils.default_process
    else:
        preprocess = None

    if preprocess is not None:
        s1 = preprocess(s1)
        s2 = preprocess(s2)

    return rapidfuzz._fuzz.WRatio(s1, s2, score_cutoff=score_cutoff)
def partial_ratio(s1: str, s2: str, processor: Union[bool, Callable] = False,
                  score_cutoff: float = 0) -> float:
    """
    Calculates a partial ratio between two strings.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            Defaults to False, i.e. the strings are compared as given. Pass True to
            use utils.default_process, which lowercases the strings and trims
            whitespace, or pass a callable of your own.
        score_cutoff (float): Optional argument for a score threshold as a float between
            0 and 100. For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.partial_ratio("this is a test", "this is a test!")
        100.0
    """
    # Resolve which preprocessing function (if any) applies to both inputs.
    if callable(processor):
        preprocess = processor
    elif processor:
        preprocess = utils.default_process
    else:
        preprocess = None

    if preprocess is not None:
        s1 = preprocess(s1)
        s2 = preprocess(s2)

    return rapidfuzz._fuzz.partial_ratio(s1, s2, score_cutoff=score_cutoff)
def token_sort_ratio(s1: str, s2: str, processor: Union[bool, Callable] = True,
                     score_cutoff: float = 0) -> float:
    """
    Sorts the words in each string and calculates the fuzz.ratio between the
    sorted results.

    Args:
        s1 (str): first string to compare
        s2 (str): second string to compare
        processor (Union[bool, Callable]): optional callable that reformats the strings.
            When True (the default) utils.default_process is used, which lowercases
            the strings and trims whitespace. Any falsy, non-callable value disables
            preprocessing.
        score_cutoff (float): Optional argument for a score threshold as a float between
            0 and 100. For ratio < score_cutoff 0 is returned instead. Defaults to 0.

    Returns:
        float: ratio between s1 and s2 as a float between 0 and 100

    Example:
        >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
        100.0
    """
    # Resolve which preprocessing function (if any) applies to both inputs.
    if callable(processor):
        preprocess = processor
    elif processor:
        preprocess = utils.default_process
    else:
        preprocess = None

    if preprocess is not None:
        s1 = preprocess(s1)
        s2 = preprocess(s2)

    return rapidfuzz._fuzz.token_sort_ratio(s1, s2, score_cutoff=score_cutoff)
def formatdb(db_file, output_file):
    """Rewrite the line-oriented database dump ``db_file`` into ``output_file``.

    The input is read line by line and written (UTF-8 encoded) to a temporary
    file, which finally receives the permissions of ``db_file`` and is moved to
    ``output_file``. While copying:

    * lines starting with ``"success":`` are dropped;
    * "name" lines (those ending in `` {`` before the newline) get their name
      replaced by ``utils.default_process(name)``; if that is empty, the name
      falls back to the URL-unquoted text after the last ``/`` of the next
      line, and finally to ``"NONE<counter>"``;
    * lines containing ``BASEURL`` have that prefix removed from the URL;
    * every other line is copied unchanged.

    Args:
        db_file: path of the dump to read.
        output_file: destination path of the rewritten dump.
    """
    #TODO: remove broken urls and data from file.. requires some work..
    #TODO: make json valid, noticing extra commas at end of file
    from rapidfuzz import utils
    fh, abs_path = mkstemp()
    # Running number used to build unique "NONE<n>" placeholder names.
    counter = 1
    with os.fdopen(fh, 'wb') as new_file:
        with open(db_file) as old_file:
            # use readline() to read the first line
            line = old_file.readline()
            while line:
                # skip status lines entirely
                if line.startswith('"success":'):
                    line = old_file.readline()
                    continue
                if line[-3:-1] == ' {':
                    # name line: the slice drops the first character and the
                    # last five (presumably the quoting plus ' {' and the
                    # newline -- confirm against the dump format)
                    string = line[1:-5]
                    #MAKE SURE DELIMITER IS STRIPPED (from searchModified.cpp):
                    newstring = utils.default_process(string)
                    oldline = line
                    # Advance to the following line; it is inspected below both
                    # as a fallback name source and as a possible URL line.
                    line = old_file.readline()
                    if len(newstring) == 0:
                        # name became empty: replace with anything after the
                        # last '/' of the following line
                        newstring = line.rstrip()[8:-3]
                        newstring = unquote(newstring[newstring.rfind('/') + 1:])
                        if utils.default_process(newstring).strip() == "":
                            # still nothing usable: invent a unique placeholder
                            newstring = "NONE" + str(counter)
                            counter += 1
                    new_file.write(oldline.replace(string, newstring).encode())
                if BASEURL in line:
                    if line.startswith('\t'):
                        # url - 1 (tab-indented variant: prefix is 9 chars)
                        url = line[9:-3]
                        starting = line[:9]
                        #skip file type, already in link
                        old_file.readline()
                    else:
                        url = line[8:-3]
                        starting = line[:8]
                    #skip status
                    # new_url = unquote(url).replace(BASEURL, '') #removed because unsafe chars might interfere with delimiter(see searchModified.cpp)
                    new_url = url.replace(BASEURL, '')
                    # Re-assemble: untouched prefix + shortened URL + the
                    # original last three characters of the line.
                    new_string = starting.encode() + new_url.encode() + line[-3:].encode()
                    new_file.write(new_string)
                    line = old_file.readline()
                    continue
                # Any other line is copied through unchanged.
                new_file.write(line.encode())
                line = old_file.readline()
    #Copy the file permissions from the old file to the new file
    copymode(db_file, abs_path)
    #Move new file
    move(abs_path, output_file)
def recognize(
    input_text: str,
    intent_graph: nx.DiGraph,
    examples_path: str,
    confidence: typing.Union[int, float] = 70,
    intent_filter: typing.Optional[typing.Callable[[str], bool]] = None,
    extra_converters: typing.Optional[typing.Dict[str, typing.Callable[
        ..., typing.Any]]] = None,
) -> typing.List[Recognition]:
    """Find the closest matching intent in the examples database.

    Args:
        input_text: raw text to recognise; it is normalised with
            ``fuzz_utils.default_process`` before the lookup.
        intent_graph: intent graph used to turn the best matching path into a
            recognition.
        examples_path: location handed to ``extract_one_sqlite``.
        confidence: minimum score, on the 0-100 scale of the matcher, that the
            best match must reach to be accepted. Defaults to 70.
        intent_filter: accepted for interface compatibility; this function
            does not consult it.
        extra_converters: forwarded to ``path_to_recognition``.

    Returns:
        A list holding the single best recognition when its score is at least
        ``confidence``; an empty list otherwise.
    """
    start_time = time.perf_counter()

    # Find closest match
    # pylint: disable=unpacking-non-sequence
    best_text, best_path, best_score, name_of_intent = extract_one_sqlite(
        fuzz_utils.default_process(input_text), examples_path)

    _LOGGER.debug("input=%s, match=%s, score=%s", input_text, best_text,
                  best_score)

    # Only the lookup itself is timed, not the conversion below.
    end_time = time.perf_counter()

    _, recognition = rhasspynlu.fsticuffs.path_to_recognition(
        best_path, intent_graph, extra_converters=extra_converters)

    # A match below the threshold is not a recognition: hand back an empty
    # list so callers always receive a list and never an un-scored result.
    if best_score < confidence:
        return []

    recognition.intent.name = name_of_intent
    recognition.intent.confidence = best_score / 100.0  # scale to 0..1
    recognition.recognize_seconds = end_time - start_time
    recognition.raw_text = input_text
    recognition.raw_tokens = input_text.split()

    return [recognition]
async def command_not_found(self, string: str) -> "HelpQueryNotFound":
    """
    Build the error for a query that matches no valid command, group, cog or category.

    The result is a `HelpQueryNotFound` instance carrying the error message
    together with the possible matches and their scores.
    """
    candidates = list(await self.get_all_help_choices())

    # The query is normalised up front, hence no processor for the choices.
    matches = process.extract(
        default_process(string),
        candidates,
        scorer=fuzz.ratio,
        score_cutoff=60,
        processor=None,
    )

    # Map every suggested choice to its score.
    suggestions = {match[0]: match[1] for match in matches}
    return HelpQueryNotFound(f'Query "{string}" not found.', suggestions)
def normalize_string(s: str) -> str:
    """Normalize a string so that fuzzy matching becomes easier.

    The input is run through ``default_process``, decomposed with Unicode
    NFKD, stripped of everything that is not ASCII, and finally has every run
    of whitespace squeezed into a single space.

    >>> normalize_string("São Paulo - Capital, SP")
    'sao paulo capital sp'
    """
    processed = default_process(s)
    decomposed = unicodedata.normalize("NFKD", processed)
    # Dropping the non-ASCII bytes removes the combining marks NFKD split off.
    ascii_only = decomposed.encode("ASCII", "ignore").decode()
    return " ".join(ascii_only.split())
def train(intent_graph: nx.DiGraph) -> ExamplesType:
    """Build the example sentences for every intent of the graph.

    Returns a mapping ``intent name -> {processed sentence -> path}``.
    """
    _LOGGER.debug("Generating examples")

    examples: ExamplesType = defaultdict(dict)
    for intent_name, words, path in generate_examples(intent_graph):
        text = " ".join(words)
        # Sentences are stored in processed form, keyed per intent.
        key = fuzz_utils.default_process(text)
        examples[intent_name][key] = path

    _LOGGER.debug("Examples generated")
    return examples
def test_fullProcess(self):
    """default_process must turn every raw string into its expected processed form."""
    # (raw input, expected output of utils.default_process)
    cases = [
        ("Lorem Ipsum is simply dummy text of the printing and typesetting industry.",
         "lorem ipsum is simply dummy text of the printing and typesetting industry"),
        ("C'est la vie", "c est la vie"),
        (u"Ça va?", u"ça va"),
        (u"Cães danados", u"cães danados"),
        (u"¬Camarões assados", u"camarões assados"),
        (u"a¬ሴ€耀", u"a ሴ 耀"),
        (u"Á", u"á"),
    ]

    for raw, expected in cases:
        self.assertEqual(utils.default_process(raw), expected)
def process_fuzzy(merged_df, alias):
    """
    Try to resolve the still unmatched entities of ``merged_df`` against
    ``alias``, first through identical wiki links and then through fuzzy
    matching of the entity text.

    ``alias['name']`` is converted to ``str`` in place and ``merged_df`` is
    updated in place. For every row whose ``entity_ref_id`` is null:

    * a wiki-link hit stores the alias ``parent`` and a ``score`` of -1;
    * a fuzzy hit of the same type stores the matched key and its score;
    * anything else only gets a ``score`` of -2.

    Returns:
        the updated ``merged_df``
    """
    alias['name'] = alias['name'].apply(str)

    # parent -> processed organisation name, used as the fuzzy choices
    processed_orgs = {}
    for _, alias_row in alias.iterrows():
        processed_orgs[alias_row['parent']] = utils.default_process(alias_row['name'])

    all_wiki_links = alias.wikipedia.unique()

    # Fetch all the entities that haven't been matched
    unmatched = merged_df[merged_df["entity_ref_id"].isnull()]
    for count, (index, entity) in enumerate(unmatched.iterrows()):
        # progress report every 100 entities
        if count % 100 == 0:
            logging.info(count)

        # 1) same wiki link as a known alias: take that alias' parent
        wikilink = entity.wiki
        if wikilink and wikilink in all_wiki_links:
            same_link = alias[alias['wikipedia'] == wikilink]
            merged_df.loc[index, 'entity_ref_id'] = same_link['parent'].iloc[0]
            merged_df.loc[index, 'score'] = -1
            continue

        # 2) fuzzy match of the entity text against all processed names
        best_match = process.extractOne(entity.text,
                                        processed_orgs,
                                        processor=process_entity,
                                        scorer=fuzz.token_set_ratio,
                                        score_cutoff=70)

        # a hit only counts when the alias has the same type as the entity
        same_type = False
        if best_match:
            matched_alias = alias[alias["parent"] == best_match[0]]
            same_type = entity.label == matched_alias.iloc[0]["type"]

        if same_type:
            merged_df.loc[index, 'entity_ref_id'] = best_match[0]
            merged_df.loc[index, 'score'] = best_match[1]
        else:
            merged_df.loc[index, 'score'] = -2

    return merged_df
def recognize(
    input_text: str,
    intent_graph: nx.DiGraph,
    examples: ExamplesType,
    intent_filter: typing.Optional[typing.Callable[[str], bool]] = None,
    extra_converters: typing.Optional[typing.Dict[str, typing.Callable[
        ..., typing.Any]]] = None,
) -> typing.List[Recognition]:
    """Find the closest matching intent(s).

    Args:
        input_text: raw text to recognise; it is normalised with
            ``fuzz_utils.default_process`` before matching.
        intent_graph: intent graph used to turn the best matching path into a
            recognition.
        examples: mapping ``intent name -> {sentence -> path}``.
        intent_filter: optional predicate on the intent name; only intents for
            which it returns True take part in the match. Defaults to
            accepting every intent.
        extra_converters: forwarded to ``path_to_recognition``.

    Returns:
        A list with the single best recognition, or an empty list when the
        matcher finds nothing (for example when no example passes
        ``intent_filter``).

    Raises:
        AssertionError: if the best path cannot be turned into a recognition
            with an intent.
    """
    start_time = time.perf_counter()
    intent_filter = intent_filter or (lambda i: True)

    # Flatten the examples of all accepted intents into sentence -> path.
    choices: typing.Dict[str, typing.List[int]] = {
        text: path
        for intent_name, paths in examples.items()
        if intent_filter(intent_name)
        for text, path in paths.items()
    }

    # Find closest match. The query is normalised here, hence processor=None.
    result = fuzzy_process.extractOne(
        fuzz_utils.default_process(input_text),
        choices.keys(),
        processor=None)

    # extractOne yields None when there is nothing to match against.
    if result is None:
        _LOGGER.debug("input=%s, no match", input_text)
        return []

    # Index instead of unpacking so that result tuples carrying extra
    # trailing fields are accepted as well.
    best_text, best_score = result[0], result[1]

    _LOGGER.debug("input=%s, match=%s, score=%s", input_text, best_text,
                  best_score)
    best_path = choices[best_text]

    # Only the matching itself is timed, not the conversion below.
    end_time = time.perf_counter()

    _, recognition = rhasspynlu.fsticuffs.path_to_recognition(
        best_path, intent_graph, extra_converters=extra_converters)

    assert recognition and recognition.intent, "Failed to find a match"
    recognition.intent.confidence = best_score / 100.0  # scale to 0..1
    recognition.recognize_seconds = end_time - start_time
    recognition.raw_text = input_text
    recognition.raw_tokens = input_text.split()

    return [recognition]
def processQuery(query: str) -> str:
    """Normalize a query string.

    Every run of whitespace in ``query`` is collapsed to a single space
    (leading and trailing whitespace is dropped), and the result is passed
    through ``default_process``.
    """
    # NOTE(review): as written, replace(' ', ' ') swaps a space for a space and
    # changes nothing; presumably it was meant to collapse double spaces
    # left behind by default_process -- confirm the intended literals.
    return default_process(" ".join(query.split())).replace(' ', ' ')
@pytest.mark.parametrize("scorer", scorers) def test_simple_unicode_tests(scorer): """ some very simple tests using unicode with scorers to catch relatively obvious implementation errors """ s1 = u"ÁÄ" s2 = "ABCD" assert scorer(s1, s2) == 0 assert scorer(s1, s1) == 100 @pytest.mark.parametrize( "processor", [True, utils.default_process, lambda s: utils.default_process(s)]) @pytest.mark.parametrize("scorer", scorers) def test_scorer_case_insensitive(processor, scorer): """ each scorer should be able to preprocess strings properly """ assert scorer(RatioTest.s1, RatioTest.s2, processor=processor) == 100 @pytest.mark.parametrize("processor", [False, None, lambda s: s]) def test_ratio_case_censitive(processor): assert fuzz.ratio(RatioTest.s1, RatioTest.s2, processor=processor) != 100 @pytest.mark.parametrize("scorer", scorers) def test_custom_processor(scorer):