def match_license(name: str) -> dict:
    """Find the SPDX license that best matches the given license name.

    The name is fuzzy-matched against the choices built from the SPDX license
    list. Unless the first pass yields a score of exactly 100, the candidates
    are refined in up to two further passes (see inline comments).

    :param name: License name
    :return: Information of the license matched
    """
    all_licenses = get_all_licenses_from_spdx()
    # Remove the word "license" (with its leading whitespace) before matching.
    name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE)

    best_matches = process.extract(name, _get_all_license_choice(all_licenses))
    # NOTE(review): raises IndexError if no choices are returned -- confirm
    # that the SPDX list can never be empty here.
    spdx_license = best_matches[0]
    if spdx_license[1] != 100:
        # Not a perfect hit: discard the "-only" variants and re-rank the rest.
        best_matches = [
            lic[0] for lic in best_matches if not lic[0].endswith("-only")
        ]

        if best_matches:
            best_matches = process.extract(name, best_matches, scorer=token_set_ratio)
            spdx_license = best_matches[0]
            # Keep only the candidates tied with the top score ...
            best_matches = [
                lic[0] for lic in best_matches if lic[1] >= spdx_license[1]
            ]
            # ... and break the tie with a different scorer.
            if len(best_matches) > 1:
                spdx_license = process.extractOne(name, best_matches, scorer=token_sort_ratio)

    log.info(f"Best match for license {name} was {spdx_license}.\n"
             f"Best matches: {best_matches}")

    return _get_license(spdx_license[0], all_licenses)
async def convert(self, ctx: commands.Context, argument: str) -> List[discord.Member]:
    """Resolve ``argument`` to a list of guild members.

    An ID or mention is looked up directly. Anything else is fuzzy-matched
    (score cutoff 75) against member names first and then against the
    nicknames of members not already found.

    Raises:
        BadArgument: when nothing was found.
    """
    bot = ctx.bot
    id_match = self._get_id_match(argument) or re.match(r"<@!?([0-9]+)>$", argument)
    guild = ctx.guild
    found = []

    if id_match is not None:
        user_id = int(id_match.group(1))
        if guild:
            found.append(guild.get_member(user_id))
        else:
            found.append(_get_from_guilds(bot, "get_member", user_id))
    elif guild:
        # Not a mention: search by name, then by nickname.
        by_name = {member: unidecode(member.name) for member in guild.members}
        found.extend(
            hit[2]
            for hit in process.extract(argument, by_name, limit=None, score_cutoff=75)
        )
        by_nick = {
            member: unidecode(member.nick)
            for member in guild.members
            if member.nick and member not in found
        }
        found.extend(
            hit[2]
            for hit in process.extract(argument, by_nick, limit=None, score_cutoff=75)
        )

    # A direct lookup may have stored a single ``None``.
    if not found or found == [None]:
        raise BadArgument('Member "{}" not found'.format(argument))
    return found
def search_audiobooks(self, since=None, author=None, title=None, tag=None, limit=25):
    """Yield audiobooks fuzzy-matching the given title and/or author.

    Title matches are yielded first, then author matches among the books
    not yet yielded.

    Args:
        since: a UNIX timestamp; returns all projects cataloged since that time
            (currently not used by this implementation)
        author: all records by that author last name
        title: all matching titles
        tag: all projects of the matching tag
            (currently not used by this implementation)
        limit: max entries to return per criterion (int)

    Yields:
        AudioBook objects, best matches first.
    """
    # Work on a copy so the list owned by get_all_audiobooks() is never mutated.
    remaining = list(self.get_all_audiobooks())

    # priority for title matches
    if title:
        for res in process.extract(title, remaining, limit=limit):
            book = res[0]
            yield book
            remaining.remove(book)

    # second author matches
    if author:
        author_names = [" ".join(str(a) for a in b.authors) for b in remaining]
        for res in process.extract(author, author_names, limit=limit):
            idx = author_names.index(res[0])
            yield remaining[idx]
            # Remove from BOTH lists so positions stay aligned for the next
            # lookup (previously only the book list shrank, which made later
            # indices point at the wrong book or past the end).
            del remaining[idx]
            del author_names[idx]
def fuzzy_match(query, mapped_choices, limit=10, score_cutoff=88):
    """Return fuzzy matches for ``query`` as a list of dicts.

    Matches scoring at least ``score_cutoff`` are preferred. If there are
    none, the search is repeated without a cutoff. Each dict carries the keys
    ``match``, ``score`` and ``result`` taken from the three-element hits
    returned by ``process.extract``.
    """

    def _as_records(hits):
        return [
            {"match": matched, "score": score, "result": result}
            for (matched, score, result) in hits
        ]

    strict = _as_records(
        process.extract(query, mapped_choices, limit=limit, score_cutoff=score_cutoff)
    )
    if strict:
        return strict
    return _as_records(process.extract(query, mapped_choices, limit=limit))
async def convert(self, ctx: commands.Context, argument: str) -> List[discord.Guild]:
    """Resolve ``argument`` to a list of guilds; bot owner only.

    An ID is looked up directly. Otherwise guild names are fuzzy-matched with
    a score cutoff of 75.

    Raises:
        BadArgument: if the author is not the bot owner or no guild matched.
    """
    bot = ctx.bot
    id_match = self._get_id_match(argument)

    if not await bot.is_owner(ctx.author):
        # Don't need to be snooping other guilds unless we're
        # the bot owner
        raise BadArgument(_("That option is only available for the bot owner."))

    if id_match:
        guild = bot.get_guild(int(id_match.group(1)))
        if not guild:
            raise BadArgument('Guild "{}" not found'.format(argument))
        return [guild]

    # Not an ID: fall back to fuzzy name search.
    names = {guild: unidecode(guild.name) for guild in bot.guilds}
    guilds = [
        hit[2]
        for hit in process.extract(argument, names, limit=None, score_cutoff=75)
    ]
    if not guilds:
        raise BadArgument('Guild "{}" not found'.format(argument))
    return guilds
async def get_fuzzy_member(self, ctx, name):
    """Find the guild member best matching ``name``.

    Lookup order:
      1. exact match on the member's name,
      2. a member whose stored name (in config) equals ``name``,
      3. best fuzzy match (score cutoff 75) on the decoded member names.

    A member found by step 1 or 3 gets ``name`` saved as their stored name.

    Returns:
        The matching member, or ``None`` when no member scores high enough
        (previously this case raised ``IndexError``).
    """
    user = discord.utils.get(ctx.guild.members, name=name)
    if user:
        await self.config.member(user).storedname.set(name)
        return user

    all_users = await self.config.all_members(ctx.guild)
    for user_id, data in all_users.items():
        member = ctx.guild.get_member(int(user_id))
        if not member:
            continue
        if data["storedname"] == name:
            return member

    matches = process.extract(
        name,
        {m: self.decode_cancer_name(m.name) for m in ctx.guild.members},
        limit=None,
        score_cutoff=75,
    )
    if not matches:
        return None

    # Each hit is indexed as (value, score, key); the key is the member.
    # max() keeps the first of equally-scored hits, like the old stable sort.
    best = max(matches, key=lambda r: r[1])[2]
    await self.config.member(best).storedname.set(name)
    return best
def search_docs(self, query, k=8, make_default=False):
    """Return up to ``k`` doc names fuzzy-matching ``query``, best first.

    An empty query yields ``k`` picks from the known names when
    ``make_default`` is set, otherwise ``None``. Scores are boosted by 50 for
    a case-insensitive exact match and by 20 when the query is a substring.
    """
    query = query.strip().lower()
    if not query:
        if make_default:
            return choices(self._docs_names, k=k)
        return None

    # further fuzzy search it using rapidfuzz ratio matching
    candidates = process.extract(
        query=query,
        choices=self._docs_names,
        scorer=fuzz.ratio,
        processor=None,
        limit=max(k, 8),
    )

    def _boosted(hit):
        name, score, _ = hit
        lowered = name.lower()
        if lowered == query:
            score += 50
        if query in lowered:
            score += 20
        return name, score

    ranked = sorted(map(_boosted, candidates), key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in ranked][:k]
async def convert(self, ctx: commands.Context, argument: str) -> discord.Role:
    """Resolve ``argument`` to a role.

    The parent converter is tried first. If it raises ``BadArgument``, role
    names are fuzzy-matched with a score cutoff of 75 and the best-scoring
    role is returned.

    Raises:
        BadArgument: when no role matches. The message is only set if
            ``self.response`` is truthy.
    """
    try:
        return await super().convert(ctx, argument)
    except BadArgument:
        pass

    role_names = {role: unidecode(role.name) for role in ctx.guild.roles}
    scored = [
        (hit[2], hit[1])
        for hit in process.extract(argument, role_names, limit=None, score_cutoff=75)
    ]
    if not scored:
        raise BadArgument(
            f'Role "{argument}" not found.' if self.response else None)

    best_role, _ = max(scored, key=lambda pair: pair[1])
    return best_role
def find_matching_site(site_name: str, possible_sites: Dict) -> str:
    """Try and find a similar name to site_name in possible_sites and return
    a suggestion or error string.

    Args:
        site_name: Name of site
        possible_sites: Mapping of known site names to their codes
    Returns:
        str: Suggestion / error message
    """
    # Nothing to compare against, so no suggestion is possible.
    if not possible_sites:
        return f"No suggestion for {site_name}."

    from rapidfuzz import process

    matches = process.extract(site_name, possible_sites.keys())

    # This seems like a decent cutoff score for a decent find
    cutoff_score = 85

    if not matches or matches[0][1] < cutoff_score:
        return f"No suggestion for {site_name}."

    best_match, best_score = matches[0][0], matches[0][1]

    # Results are ordered best-first, so a tie shows up in the second entry.
    # The length guard covers the single-candidate case.
    if len(matches) > 1 and matches[1][1] == best_score:
        suggestions = [
            f"{match.title()}, code: {possible_sites[match]}" for match, _, _ in matches
        ]
        nl_char = "\n"
        return f"Did you mean one of : \n {nl_char.join(suggestions)}"

    # Unique best match at or above the cutoff. A score exactly equal to the
    # cutoff used to fall through to an "Unknown site" message.
    return f"Did you mean {best_match.title()}, code: {possible_sites[best_match]} ?"
async def search_for_players(name: str):
    """Fuzzy-match ``name`` against the full names of all stored players.

    The (player_id, full_name) rows are turned into a dict which is used as
    the choices for ``process.extract``. The raw extract result is returned.
    """
    rows = await NHLPlayers.select('player_id', 'full_name').gino.all()
    return process.extract(name, dict(rows))
def find_fitting_model(self, search_term: str, limit: int = 5) -> List[str]:
    """
    Suggest model names that fit a search term (e.g.: Sensor).

    A suggested name can afterwards be resolved with the method
    "get_class_by_name(selectedName)".

    Args:
        search_term (str): search term to find a model by name
        limit (int): Max Number of suggested results (default: 5)

    Returns:
        List[str], containing 0 to [limit] ordered propositions (best first)
    """
    class_names = list(self.class_catalogue.keys())
    matches = process.extract(
        query=search_term.casefold(),
        choices=class_names,
        score_cutoff=50,
        limit=limit,
    )
    # Only the matched name (first element of each hit) is of interest.
    return [match[0] for match in matches]
def match_authors(name, allcontrib_names, allcontrib_login):
    """
    Use fuzzy string matching to match names of committers to the names or
    handles mentioned in the allcontributors file.

    Args:
        name: str; name of committer
        allcontrib_names: list; names in allcontributorsrc file
        allcontrib_login: list; logins in allcontributorsrc file

    Returns:
        tuple: ``([name, matched_entry], None)`` when a name or login matched
        with a score of at least 71, otherwise ``(None, name)``.
    """
    # First, match the name. If no match, try Github login
    matching = process.extractOne(
        name, allcontrib_names, scorer=fuzz.token_sort_ratio, score_cutoff=71
    )
    if not matching:
        # we likely haven't found a match yet, lets check Github handles.
        # extractOne keeps the result shape identical to the name lookup, so
        # matching[0] is the matched string in both cases (extract() returned
        # a list, making matching[0] a whole result tuple).
        matching = process.extractOne(
            name, allcontrib_login, scorer=fuzz.token_sort_ratio, score_cutoff=71
        )
    if matching:
        return [name, matching[0]], None
    return None, name
def has_wake_word(self, phrase):
    """Tell whether ``phrase`` starts with the wake word.

    A leading word resembling "ok" or "hey" (score above 80) is skipped. The
    next word must match ``self.wake_word`` with a ratio of at least 80. On
    success the remainder of the phrase is stored in ``self.heard``.
    """
    words = phrase.split()
    candidate = False
    heard_from = 0

    if len(words) == 1:
        candidate = words[0]
        self.heard = ""
    elif len(words) > 1:
        first_word, second_word = words[0:2]
        prefix_hits = process.extract(first_word, ["ok", "hey"])
        if any(hit[1] > 80 for hit in prefix_hits):
            # "ok <wake word> ..." / "hey <wake word> ..."
            candidate, heard_from = second_word, 2
        else:
            candidate, heard_from = first_word, 1

    if not (candidate and isinstance(candidate, str)):
        return False

    similarity = fuzz.ratio(candidate.lower(), self.wake_word.lower())
    detected = similarity >= 80
    if detected:
        self.heard = " ".join(words[heard_from::])
    return detected
def _get_page(self, topic, request_options=None):
    """Return an "unknown topic" page suggesting the three closest topics.

    Topics starting with ':' are only compared with other ':' topics, and
    regular topics only with regular ones. ``request_options`` is accepted
    but not used here.
    """
    topics_list = self.get_topics_list()
    if topic.startswith(':'):
        topics_list = [x for x in topics_list if x.startswith(':')]
    else:
        topics_list = [x for x in topics_list if not x.startswith(':')]

    if _USING_FUZZYWUZZY:
        possible_topics = process.extract(topic, topics_list, scorer=fuzz.ratio)[:3]
    else:
        possible_topics = process.extract(topic, topics_list, limit=3, scorer=fuzz.ratio)

    # Index the hit explicitly: hits may be (match, score) pairs or
    # (match, score, index) triples depending on the fuzzy backend, and
    # "%s %s" % triple raises TypeError.
    possible_topics_text = "\n".join(
        [(" * %s %s" % (x[0], x[1])) for x in possible_topics]
    )
    return """ Unknown topic. Do you mean one of these topics maybe? %s """ % possible_topics_text
async def command_not_found(self, string: str) -> "HelpQueryNotFound":
    """
    Build the error for a query that matches no command, group, cog or category.

    Returns a `HelpQueryNotFound` instance carrying the error message and a
    mapping of close matches (ratio of at least 60) to their scores.
    """
    choices = list(await self.get_all_help_choices())
    normalized = default_process(string)
    result = process.extract(
        normalized, choices, scorer=fuzz.ratio, score_cutoff=60, processor=None
    )

    suggestions = {}
    for choice in result:
        suggestions[choice[0]] = choice[1]
    return HelpQueryNotFound(f'Query "{string}" not found.', suggestions)
def testIssue81(self):
    """Extracting from a pandas Series keeps the Series index in the results."""
    # this mostly tests whether this segfaults due to incorrect ref counting
    labels = ['test color brightness', 'test lemon', 'test lavender']
    ids = [67478, 67479, 67480]
    choices = pd.Series(labels, index=ids)

    matches = process.extract("test", choices)

    expected = [(label, 90.0, idx) for label, idx in zip(labels, ids)]
    assert matches == expected
async def extract_from_list(self, ctx, argument, list_of_items, processors, unsure=False):
    """Uses multiple scorers and processors for a good mix of accuracy and fuzzy-ness

    Every (scorer, processor) combination is run with a score cutoff of 80
    and a limit of 5. A hit is kept when it is not already collected and the
    lowercased argument occurs in the processed entry.

    Returns:
        ``None`` when nothing matched; the result of ``selection_handler``
        (given at most five hits) when several entries matched; otherwise the
        single matched entry. With ``unsure`` set and a score below 95,
        ``unsure_select_handler`` is awaited first.
    """
    combined_list = []
    scorers = (fuzz.token_set_ratio, fuzz.WRatio)
    needle = argument.lower()

    for scorer in scorers:
        for processor in processors:
            fuzzy_list = process.extract(
                argument,
                list_of_items,
                processor=processor,
                scorer=scorer,
                score_cutoff=80,
                limit=5,
            )
            if not fuzzy_list:
                continue

            combined_entries = [e[0] for e in combined_list]
            # WRatio isn't the best, so we add in extra filters to make sure
            # everything turns out ok. The check must look at the *scorer*;
            # comparing the processor with fuzz.WRatio was never true.
            extra_filter = scorer is fuzz.WRatio
            for entry in fuzzy_list:
                if entry[0] in combined_entries:
                    continue
                processed = processor(entry[0])
                if extra_filter and not (len(processed) >= 2 or len(argument) <= 2):
                    continue
                if needle in processed:
                    combined_list.append(entry)

    if not combined_list:
        return None
    if len(combined_list) > 1:
        # Never offer more than five options to choose from.
        return await self.selection_handler(ctx, combined_list[:5])

    entry, score = combined_list[0][0], combined_list[0][1]
    if unsure and score < 95:
        await self.unsure_select_handler(ctx, entry)
    return entry
def search(query: str, result_size: int = 5) -> List[NoteInDB]:
    """Return up to ``result_size`` notes whose content fuzzy-matches ``query``.

    Note contents are keyed by their ``doc_id`` and scored with
    ``token_set_ratio`` (cutoff 20). Each hit's key is used to load the note.
    """
    contents_by_id = {note.doc_id: note.content for note in Notes.all()}
    hits = process.extract(
        query,
        choices=contents_by_id,
        scorer=fuzz.token_set_ratio,
        limit=result_size,
        score_cutoff=20,
    )

    found = []
    for hit in hits:
        # third element of a hit is the key of the matched choice (doc_id)
        found.append(Notes.get_by_id(hit[2]))
    return found
def rank(self, target, searches, limit=10):
    """Rank the keys of ``searches`` by similarity to ``target``.

    Each key's ``partial_ratio`` score is weighted by ``log(value + 1)`` of
    its entry in ``searches``. Hits with a score of zero are dropped. When
    nothing remains, ``[target]`` is returned.
    """
    hits = process.extract(target, searches.keys(), limit=limit, scorer=fuzz.partial_ratio)

    weighted = []
    for hit in hits:
        term, score = hit[0], hit[1]
        if score > 0:
            weighted.append((term, score * math.log(searches[term] + 1)))

    if not weighted:
        return [target]

    weighted.sort(key=lambda pair: -pair[1])
    return [term for term, _ in weighted]
def _handle_not_found(self, query: str) -> None:
    """
    Handles when a query does not match a valid command or cog.

    Will pass on possible close matches (score of at least 90) along with
    the `HelpQueryNotFound` exception.

    Raises:
        HelpQueryNotFound: always.
    """
    # Combine command and cog names
    choices = list(self._bot.all_commands) + list(self._bot.cogs)

    result = process.extract(query, choices, score_cutoff=90)

    # Build the mapping by index instead of dict(result): hits may carry a
    # third element (the choice index), which dict() cannot consume.
    possible_matches = {choice[0]: choice[1] for choice in result}
    raise HelpQueryNotFound(f'Query "{query}" not found.', possible_matches)
def fuzzy_find(matches, urls):
    """Return the url(s) with the highest summed fuzzy score over all ``matches``.

    Every entry of ``matches`` is scored against every url and the scores are
    added up per url. All urls sharing the top total are returned; the list is
    empty when there is nothing to score.
    """
    totals = {}
    for needle in matches:
        for url, score in process.extract(needle, urls, limit=None):
            totals[url] = totals.get(url, 0) + score

    ranked = sorted(((total, url) for url, total in totals.items()), reverse=True)
    if not ranked:
        return []

    # Everything tied with the leader; a single url when the leader is unique.
    top_total = ranked[0][0]
    return [url for _, url in takewhile(lambda pair: pair[0] == top_total, ranked)]
def check_consistency(cls, values):
    """
    Validate the unit fields against the loaded unit table and fill in the
    missing ones.

    Args:
        values (dict): Values of a all data fields

    Returns:
        values (dict): Validated data

    Raises:
        AssertionError: if neither 'name' nor 'code' is given.
        ValueError: if the given 'code' and/or 'name' is not in the table.
    """
    units = load_units()

    code = values.get("code")
    name = values.get("name")
    # Unwrap model instances to their plain values.
    if isinstance(code, UnitCode):
        code = code.value
    if isinstance(name, UnitText):
        name = name.value

    if not code and not name:
        raise AssertionError("'name' or 'code' must be provided!")

    if code and name:
        idx = units.index[((units.CommonCode == code) & (units.Name == name))]
        if idx.empty:
            raise ValueError("Invalid combination of 'code' and 'name': ",
                             code, name)
    elif code:
        idx = units.index[(units.CommonCode == code)]
        if idx.empty:
            raise ValueError("Invalid 'code': ", code)
    else:
        # Only a name was given.
        idx = units.index[(units.Name == name)]
        if idx.empty:
            names = units.Name.tolist()
            suggestions = [item[0] for item in process.extract(
                query=name.casefold(),
                choices=names,
                score_cutoff=50,
                limit=5)]
            raise ValueError(f"Invalid 'name' for unit! '{name}' \n "
                             f"Did you mean one of the following? \n "
                             f"{suggestions}")

    # Complete the remaining fields from the first matching row.
    row = idx[0]
    values["code"] = UnitCode(value=units.CommonCode[row]).value
    values["name"] = UnitText(value=units.Name[row]).value
    values["symbol"] = units.Symbol[row]
    values["conversion_factor"] = units.ConversionFactor[row]
    if not values.get("description"):
        values["description"] = units.Description[row]
    return values
def testWithScorer(self):
    """The default scorer and QRatio pick different best matches, for both
    list and dict choices and for both extractOne and extract."""
    games = [
        "new york mets vs chicago cubs",
        "chicago cubs at new york mets",
        "atlanta braves vs pittsbugh pirates",
        "new york yankees vs boston red sox"
    ]
    # same entries, keyed 1..4
    games_by_id = dict(enumerate(games, start=1))

    # in this hypothetical example we care about ordering, so we use quick ratio
    query = "new york mets at chicago cubs"

    # first, as an example, the normal way would select the "more 'complete' match of choices[1]"
    for pool, expected in ((games, games[1]), (games_by_id, games_by_id[2])):
        self.assertEqual(process.extractOne(query, pool)[0], expected)
        self.assertEqual(process.extract(query, pool)[0][0], expected)

    # now, use the custom scorer
    for pool, expected in ((games, games[0]), (games_by_id, games_by_id[1])):
        self.assertEqual(
            process.extractOne(query, pool, scorer=fuzz.QRatio)[0], expected)
        self.assertEqual(
            process.extract(query, pool, scorer=fuzz.QRatio)[0][0], expected)
def ner2slot(self, input_entity, slot):
    """Map a named entity to the normalized value of ``slot``.

    A list of tokens is joined with spaces first. The entity is fuzzy-matched
    against every known variant of every slot value. The normalized value
    owning the best variant is returned together with the match score.
    """
    # Given named entity return normalized slot value
    if isinstance(input_entity, list):
        input_entity = ' '.join(input_entity)

    slot_values = self._slot_vals[slot]
    pairs = [
        (variant, normalized)
        for normalized in slot_values
        for variant in slot_values[normalized]
    ]
    variants = [variant for variant, _ in pairs]

    # Huge limit so that no candidate is cut off before taking the best one.
    best_match, score = process.extract(input_entity, variants, limit=2**20)[0]
    return pairs[variants.index(best_match)][1], score
def get_fuzzy_accounts(identifier, accounts):
    """Suggest the account names or aliases closest to ``identifier``.

    :param identifier: identifier to fuzz
    :param accounts: list of accounts to compare (each with "name" and "aliases")
    :return: The three best results of ``process.extract`` over all names and aliases.
    """
    # collect all possibilities: each account's name followed by its aliases
    choices = [
        label
        for account in accounts
        for label in (account["name"], *account["aliases"])
    ]
    return process.extract(identifier, choices, limit=3)
def validate_text(cls, value):
    """Accept ``value`` if it equals a known unit name, ignoring case.

    Raises:
        ValueError: when the name is unknown; the message lists up to five
            fuzzy suggestions (score cutoff 50).
    """
    units = load_units()

    is_known = units.Name.str.casefold() == value.casefold()
    if is_known.any():
        return value

    suggestions = [
        item[0]
        for item in process.extract(
            query=value.casefold(),
            choices=units.Name.tolist(),
            score_cutoff=50,
            limit=5)
    ]
    raise ValueError(f"Invalid 'name' for unit! '{value}' \n "
                     f"Did you mean one of the following? \n "
                     f"{suggestions}")
def part2(puzzle_input: list) -> str:
    """Return the characters shared by the two entries differing in one position.

    For each entry the second-best fuzzy match is taken (the best one is
    expected to be the entry itself). If it is similar enough, the entry is
    returned with the first differing character removed.
    """
    for box_id in puzzle_input:
        # Get closest match, ignore self match
        runner_up = process.extract(box_id, puzzle_input, limit=2)[1]

        # Match should differ by 1 character, calculate this percentage and use as a threshold
        id_len = len(box_id)
        min_score = floor(((id_len - 1) / id_len) * 100)
        if runner_up[1] < min_score:
            continue

        for pos, letter in enumerate(runner_up[0]):
            if box_id[pos] != letter:
                # drop the mismatching position
                return box_id[:pos] + box_id[pos + 1:]
async def order(self, keywords, queue):
    """Queue the files fuzzy-matching a "td <search terms>" request.

    Does nothing unless the first keyword is "td" and search terms follow.
    Every hit is put on ``queue`` as a ``TeXPdfFile`` with its score raised
    by 50.
    """
    kw = " ".join(keywords[1:]) if keywords[0] == "td" else ""
    if not kw:
        return

    from rapidfuzz import fuzz, process

    hits = process.extract(
        kw,
        self.for_compare,
        limit=self.num_candidates,
        scorer=fuzz.partial_token_sort_ratio,
        score_cutoff=self.score_cutoff,
    )
    for hit in hits:
        score, idx = hit[1], hit[2]
        queue.put(TeXPdfFile(self.files[idx], score + 50))
def get_fuzzy_role(self, ctx, name: str):
    """Return the guild role whose name best fuzzy-matches ``name``.

    Role names are compared after ``unidecode`` with a score cutoff of 75.

    Raises:
        BadArgument: when no role scores high enough.
    """
    matches = process.extract(
        name,
        {r: unidecode(r.name) for r in ctx.guild.roles},
        limit=None,
        score_cutoff=75,
    )
    if not matches:
        # The placeholder was previously left unformatted ("{} is not ...").
        raise BadArgument("{} is not a valid role".format(name))

    # Hits are indexed as (value, score, key); the key is the role object.
    # max() keeps the first of equally-scored hits, like the old stable sort.
    return max(matches, key=lambda r: r[1])[2]
def Match(self, query: str):
    """Return the D-Bus tuples of all actions whose key fuzzy-matches ``query``.

    Action keys are scored with ``token_sort_ratio`` (cutoff 25). Each hit is
    resolved back to its action and converted with ``to_dbus_tuple(score)``.
    """
    logger.info(f"Matching {query}")
    actions = self.runners_manager.actions()
    action_keys = [action.key for action in actions]

    results = process.extract(
        query, action_keys, scorer=fuzz.token_sort_ratio, score_cutoff=25
    )
    logger.info(f"Search results: {results}")

    return [
        self.runners_manager.get_action_by_key(result_key).to_dbus_tuple(score)
        for result_key, score, _ in results
    ]