def compare_to_query(self, search_query: str) -> float:
    """Fuzzy-score this entry against *search_query*.

    Four comparison passes are summed and averaged: token-wise against the
    parsed name with and without the ".params." variant, then the full query
    against the plain name and against a "parameter "-prefixed name.
    """
    total = 0.0
    query_tokens = self.parse_search_query(search_query)
    # Compare all the single parts of the query, with & without ".params.".
    for wanted, have in zip(query_tokens, self._parsed_name):
        total += fuzz.ratio(wanted, have)
    for wanted, have in zip(query_tokens, self._parsed_name_wo_params):
        total += fuzz.ratio(wanted, have)
    # ... and the full name, being generous about a leading "parameter".
    total += fuzz.ratio(search_query, self.name)
    total += fuzz.ratio(search_query, f"parameter {self.name}")
    # Average over the four passes so we don't overrule other results.
    return total / 4
def compare_to_query(self, search_query: str) -> float:
    """Fuzzy-score this entry against *search_query*.

    Averages a token-wise comparison of the parsed name with a full-name
    comparison; "std:"-domain entries are down-weighted.
    """
    query_tokens = self.parse_search_query(search_query)
    total = 0.0
    # Compare all the single parts of the query ...
    for wanted, have in zip(query_tokens, self._parsed_name):
        total += fuzz.ratio(wanted, have)
    # ... and the full name, because we're generous.
    total += fuzz.ratio(search_query, self.name)
    # Average the two passes so we don't overrule other results.
    total /= 2
    # IISC "std:" is the domain for general stuff like headlines and
    # chapters; we'll wanna give those a little less weight.
    if self.entry_type.startswith("std:"):
        total *= 0.8
    return total
async def check_for_bad_links(message, domains):
    """Delete *message* and log it if any of *domains* looks like a
    typo-squat of the real gift domain (close match, but not exact)."""
    log_channel = client.get_channel(int(LOG_CHANNEL))
    for candidate in domains:
        similarity = fuzz.ratio(REAL_GIFT_DOMAIN, candidate)
        # Close enough to be suspicious, but not the real domain itself.
        if MIN_RATIO <= similarity < 100:
            await message.delete()
            await log_channel.send(
                f'Deleted message from <@{message.author.id}> for suspicious link'
            )
            break
def best_fuzzy_match(headline, articles):
    """Find the article whose headline best fuzzy-matches *headline*.

    Returns (score, headline, article_id); the latter two are None when
    *articles* has no rows.
    """
    top_score = 0
    top_headline = None
    top_article_id = None
    for idx in range(articles.shape[0]):
        candidate = articles.headline[idx]
        score = fuzz.ratio(headline, candidate)
        if score > top_score:
            top_score = score
            top_headline = candidate
            top_article_id = articles.total_article_number[idx]
    return top_score, top_headline, top_article_id
async def get_nbnhhsh(keyword: str) -> Tuple[str, str]:
    """Query the nbnhhsh abbreviation-guessing API for *keyword*.

    Returns (title, result_text), or ("", "") when the best hit's name is
    not a close enough match (fuzz ratio < 90) to the keyword.
    """
    url = "https://lab.magiconch.com/api/nbnhhsh/guess"
    headers = {"referer": "https://lab.magiconch.com/nbnhhsh/"}
    data = {"text": keyword}
    async with httpx.AsyncClient() as client:
        resp = await client.post(url=url, headers=headers, data=data)
        res = resp.json()
    title = ""
    lines = []
    for entry in res:
        # Only entries that carry a non-empty translation list count.
        if entry.get("trans"):
            title = entry["name"]
            lines.append(f"{entry['name']} => {','.join(entry['trans'])}")
    result = "\n".join(lines)
    if fuzz.ratio(title.lower(), keyword.lower()) < 90:
        return "", ""
    return title, result
def dp_search(
    self,
    query: str,
    nlp: Any,
    entity_type: str = "",
    entity_patterns: List[str] = None,  # defaults to [""] when omitted
    match_dict: Dict[Any, Any] = None,  # defaults to {} when omitted
) -> Tuple[Text, Label, Value, Span, Score]:
    """Extract a multi-word entity from *query* and fuzzy-match it.

    Words tagged PROPN/NOUN/ADP by the dependency parser are joined into a
    single candidate value; that value is scored (``fuzz.ratio`` / 100)
    against every pattern, and scores above ``self.fuzzy_threshold`` are
    mapped through *match_dict*.

    Args:
        query: the raw user utterance.
        nlp: a parser whose output exposes ``.sentences[0].words``.
        entity_type: label to attach to the extracted value.
        entity_patterns: candidate entity strings to match against.
        match_dict: maps each pattern to the value to return on a match.

    Returns:
        (value, entity_type, match, (span_start, span_end), score); the
        last three are ""/(0, 0)/0.0 when nothing matched.
    """
    # Bug fix: the original used mutable default arguments ([""], {}),
    # which are shared across calls; use None sentinels instead.
    if entity_patterns is None:
        entity_patterns = [""]
    if match_dict is None:
        match_dict = {}
    sentence = nlp(query).sentences[0]
    value = ""
    pos_tags = ["PROPN", "NOUN", "ADP"]
    result_dict = {}
    for word in sentence.words:
        if word.upos in pos_tags:
            if value == "":
                span_start = word.start_char
            span_end = word.end_char
            # Joining individual tokens that together are the real entity,
            # since we are dealing with multi-word entities here.
            value = value + str(word.text) + " "
    if value != "":
        for pattern in entity_patterns:
            val = fuzz.ratio(pattern, value) / 100
            if val > self.fuzzy_threshold:
                match_value = match_dict[pattern]
                result_dict[match_value] = val
    if result_dict:
        match_output = max(result_dict, key=lambda x: result_dict[x])
        match_score = result_dict[match_output]
        return (
            value,
            entity_type,
            match_output,
            (span_start, span_end),
            match_score,
        )
    return (value, entity_type, "", (0, 0), 0.0)
async def get_baidu(keyword: str) -> Tuple[str, str]:
    """Look up *keyword* via ``getBaike`` and return (title, message).

    Returns ("", "") when nothing is found, the content doesn't match the
    expected layout, or the article title is not a close enough match
    (fuzz ratio < 90) to the keyword.
    """
    content = getBaike(keyword)
    if not content:
        return "", ""
    # Split into title / optional parenthesised subtitle / body text.
    # NOTE(review): the pattern contains a nested group `((.*?)?)`, so
    # group(2) is the outer (subtitle) group and group(3) is its inner
    # group — NOT the body after the newline, which would be group(4).
    # This looks like the parentheses were mangled (originally full-width
    # Chinese parens?); verify against real getBaike output.
    match_obj = re.match(r"(.*?)((.*?)?)\n(.*)", content)
    if not match_obj:
        return "", ""
    title = match_obj.group(1)
    subtitle = match_obj.group(2)
    text = match_obj.group(3)
    # Reject results whose title doesn't closely match the query.
    if fuzz.ratio(title.lower(), keyword.lower()) < 90:
        return "", ""
    msg = title
    if subtitle:
        msg += subtitle
    msg += ":\n---------------\n" + text
    return title, msg
async def get_jiki(keyword: str) -> Tuple[str, Union[str, Message]]:
    """Look up *keyword* on jikipedia.com and return (title, message).

    The message contains the entry's title, text content, and any images.
    Returns ("", "") when the term is not in the dictionary, no search
    result card is found, or the entry title is not a close enough match
    (fuzz ratio < 90) to the keyword.
    """
    # NOTE(review): keyword is re-bound to its URL-quoted form here and
    # later compared (lowercased) against the page title — for non-ASCII
    # keywords the percent-encoding would defeat the fuzzy match; confirm
    # intended behaviour.
    keyword = quote(keyword)
    search_url = "https://jikipedia.com/search?phrase={}".format(keyword)
    async with httpx.AsyncClient() as client:
        resp = await client.get(url=search_url)
        result = resp.text
    # The site returns this sentence when the term has no entry.
    if "对不起!小鸡词典暂未收录该词条" in result:
        return "", ""
    dom = etree.HTML(result, etree.HTMLParser())
    card_urls = dom.xpath(
        "//div[contains(@class, 'masonry')]/div/div/div/a[contains(@class, 'title-container')]/@href"
    )
    if not card_urls:
        return "", ""
    # Follow the first search hit to its full entry page.
    card_url = card_urls[0]
    async with httpx.AsyncClient() as client:
        resp = await client.get(url=card_url)
        result = resp.text
    dom = etree.HTML(result, etree.HTMLParser())
    title = dom.xpath(
        "//div[@class='section card-middle']/div[@class='title-container']/div/h1/text()"
    )[0]
    content = dom.xpath(
        "//div[@class='section card-middle']/div[@class='content']/div")[0]
    # Collapse the content node to its plain text.
    content = content.xpath("string(.)").strip()
    img_urls = dom.xpath(
        "//div[@class='section card-middle']/div/div/div[@class='show-images']/img/@src"
    )
    # Reject entries whose title doesn't closely match the query.
    if fuzz.ratio(str(title).lower(), keyword.lower()) < 90:
        return "", ""
    msg = Message()
    msg.append(title + ":\n---------------\n")
    msg.append(content)
    for img_url in img_urls:
        msg.append(MessageSegment.image(file=img_url))
    return title, msg
async def get_content(
        keyword: str, sources=("jiki", "baidu", "nbnhhsh")) -> Union[str, Message]:
    """Query each lookup backend for *keyword* and return the best answer.

    Each source in *sources* is tried in order; failures are logged and
    skipped. When several backends return a hit, the one whose title best
    fuzzy-matches the keyword wins.

    Args:
        keyword: the term to look up.
        sources: keys into ``sources_func`` naming the lookup backends.
            (Fixed: the default was a mutable list, now an immutable tuple.)

    Returns:
        The winning message, or "" when nothing was found.
    """
    result = ""
    msgs: List[Tuple[str, Union[str, Message]]] = []
    for s in sources:
        try:
            title, msg = await sources_func[s](keyword)
            if title and msg:
                msgs.append((title, msg))
        except Exception as e:
            logger.warning(f"Error in get_content({keyword}) using {s}: {e}")
    if len(msgs) == 1:
        result = msgs[0][1]
    elif len(msgs) > 1:
        # Pick the hit whose title is closest to the query.
        msgs = sorted(msgs,
                      key=lambda m: fuzz.ratio(m[0].lower(), keyword.lower()),
                      reverse=True)
        result = msgs[0][1]
    return result
async def get_deviantart_posts(self, msg: discord.Message, urls: list[str]):
    """Automatically fetch multiple posts from deviantart.

    Compares the title slugs of all urls; if on average they are too
    dissimilar (or the deviation types differ), the posts are flagged to
    be sent as individual embeds rather than one group.
    """
    MAX_EMBEDS = 5

    title_to_test_against = urls[0].split('/')[-1].rsplit('-', maxsplit=1)[0]
    display_as_singles = False
    # Bug fix: the original divided by len(urls) - 1 unconditionally,
    # raising ZeroDivisionError when only a single url was passed.
    if len(urls) > 1:
        similarity_ratio = 0
        for url in urls[1:]:
            title = url.split('/')[-1].rsplit('-', maxsplit=1)[0]
            similarity_ratio += fuzz.ratio(title, title_to_test_against)
            print(f"{title}: {title_to_test_against} ({fuzz.ratio(title, title_to_test_against)})")
        similarity_ratio /= len(urls) - 1
        print(f"Url similarity ratio: {similarity_ratio}")
        if similarity_ratio < 90:
            print("Urls seem unrelated from each other. Sending each embed individually.")
            display_as_singles = True

    # Check what type the first post is and if subsequent posts are of different types,
    # send them in one batch, but using different embed groups
    base_type: str = None
    api_results = []
    for url in urls:
        if not (post_id := self.get_id(url)):
            return
        search_url = self.bot.assets['deviantart']['search_url_extended'].format(post_id)
        err_msg = f"Error fetching DA post #{post_id}"
        api_result = (await net_core.http_request(search_url, json=True, err_msg=err_msg)).json
        deviation = api_result['deviation']
        deviation_type = deviation['type']
        if base_type is None:
            base_type = deviation_type
        if deviation_type != base_type:
            print("Deviation types differ. Sending each embed individually.")
            display_as_singles = True
        api_results.append(api_result)
async def dnd_spell_autocomplete(self, ctx: commands.SlashContext, interaction: discord.Interaction[None]):
    """
    Fuzzy match what the user input and the list of all spells.
    """
    # Grab the partial spell name the user has typed so far.
    assert interaction.options is not None
    user_input = interaction.options[0].options[0].value
    assert user_input

    # Keep only spells containing the input (case-insensitive),
    # each paired with its fuzzy similarity score, best first.
    scored = [
        (spell, fuzz.ratio(spell, user_input))
        for spell in self.all_spells
        if user_input.casefold() in spell.casefold()
    ]
    scored.sort(key=operator.itemgetter(1), reverse=True)

    # And give them the top 25 results.
    await interaction.response.send_autocomplete([
        discord.ApplicationCommandOptionChoice(name=spell, value=spell)
        for spell, _ in scored[:25]
    ])
def testCaseInsensitive(self):
    """Raw ratio is case-sensitive; full_process normalisation yields 100."""
    self.assertNotEqual(fuzz.ratio(self.s1, self.s2), 100)
    processed_a = utils.full_process(self.s1)
    processed_b = utils.full_process(self.s2)
    self.assertEqual(fuzz.ratio(processed_a, processed_b), 100)
def compare_to_query(self, search_query: str) -> float:
    """Score how well this tag matches the first token of *search_query*.

    Leading slashes are stripped so "/tag ..." and "tag ..." are treated
    alike; only the first whitespace-separated token is compared.

    Returns:
        The fuzz ratio against the tag, or 0.0 for an empty query.
        (Fixed: previously returned the int ``0``, inconsistent with the
        declared ``float`` return type.)
    """
    parts = search_query.lstrip("/").split(maxsplit=1)
    if parts:
        return fuzz.ratio(self.tag, parts[0])
    return 0.0
def compare_to_query(self, search_query: str) -> float:
    """Fuzzy-compare this contribution's name against *search_query*.

    Everything before the first "/" is assumed to be "ptbcontrib" (modulo
    typos) and is dropped before comparing. That could be wrong, but then
    it's the user's fault :)
    """
    _, separator, remainder = search_query.partition("/")
    query = remainder if separator else search_query
    return fuzz.ratio(self.name, query)
def similar(a, b):
    """Return the fuzzy similarity score (0-100) between *a* and *b*."""
    return fuzz.ratio(a, b)
def testEqual(self):
    """Equal string pairs must score a perfect 100."""
    pairs = [
        (self.s1, self.s1a),
        (self.s8, self.s8a),
        (self.s9, self.s9a),
    ]
    for left, right in pairs:
        self.assertEqual(fuzz.ratio(left, right), 100)
def testRatioUnicodeString(self):
    """A single accented character shares nothing with "ABCD" → score 0."""
    accented = "\u00C1"
    plain = "ABCD"
    self.assertEqual(0, fuzz.ratio(accented, plain))
def testEmptyStringsScore100(self):
    """Two empty strings count as a perfect match for both ratio variants."""
    empty = ""
    self.assertEqual(fuzz.ratio(empty, empty), 100)
    self.assertEqual(fuzz.partial_ratio(empty, empty), 100)
# -*- coding:utf-8 -*- """ 参考 https://github.com/seatgeek/thefuzz """ __author__ = "aaron.qiu" from pprint import pprint from thefuzz import fuzz from thefuzz import process if __name__ == '__main__': pprint(fuzz.ratio("this is a test", "this is a test!")) pprint(fuzz.partial_ratio("this is a test", "this is a test!")) pprint(fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")) pprint( fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")) pprint(fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")) pprint(fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")) choices = [ "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys" ] pprint(process.extract("new york jets", choices, limit=2)) pprint(process.extractOne("cowboys", choices)) songs = "/data/soft" pprint(process.extractOne("System of a down - Hypnotize - apache", songs)) process.extractOne("System of a down - Hypnotize - Heroin", songs, scorer=fuzz.token_sort_ratio)