Beispiel #1
0
    def compare_to_query(self, search_query: str) -> float:
        """Score how well this entry matches *search_query* (kept <= 100)."""
        processed_query = self.parse_search_query(search_query)

        # Piece-wise comparison of the query parts against the parsed name,
        # both with and without the ".params." component.
        partial_scores = [
            fuzz.ratio(query_part, name_part)
            for query_part, name_part in zip(processed_query, self._parsed_name)
        ]
        partial_scores += [
            fuzz.ratio(query_part, name_part)
            for query_part, name_part in zip(processed_query, self._parsed_name_wo_params)
        ]
        # ... and the full name, because we're generous: with & without a
        # leading "parameter".
        partial_scores.append(fuzz.ratio(search_query, self.name))
        partial_scores.append(fuzz.ratio(search_query, f"parameter {self.name}"))

        # Divide by the four comparison groups so we stay <= 100 and don't
        # overrule other results.
        return sum(partial_scores) / 4
Beispiel #2
0
    def compare_to_query(self, search_query: str) -> float:
        """Score this entry against *search_query*; "std:" entries weigh less."""
        processed_query = self.parse_search_query(search_query)

        # Compare all the single parts of the query against the parsed name ...
        total = 0.0
        for query_part, name_part in zip(processed_query, self._parsed_name):
            total += fuzz.ratio(query_part, name_part)
        # ... and the full name, because we're generous.
        total += fuzz.ratio(search_query, self.name)
        # Halve so we stay <= 100 and don't overrule other results.
        total /= 2

        # "std:" is the domain for general stuff like headlines and chapters;
        # give those a little less weight.
        return total * 0.8 if self.entry_type.startswith("std:") else total
Beispiel #3
0
async def check_for_bad_links(message, domains):
    """Delete *message* and log it when any of *domains* looks like a
    near-miss (typosquat) of the real gift domain."""
    log_channel = client.get_channel(int(LOG_CHANNEL))
    for domain in domains:
        similarity = fuzz.ratio(REAL_GIFT_DOMAIN, domain)
        # A perfect 100 is the legitimate domain itself; only close-but-not-
        # exact matches are treated as suspicious.
        if MIN_RATIO <= similarity < 100:
            await message.delete()
            await log_channel.send(
                f'Deleted message from <@{message.author.id}> for suspicious link'
            )
            break
def best_fuzzy_match(headline, articles):
    """Return (score, headline, article_id) for the row of *articles* whose
    headline is the closest fuzzy match to *headline*.

    All three values stay at their defaults (0, None, None) when *articles*
    is empty or no row scores above 0.
    """
    top_score = 0
    top_headline = None
    top_article_id = None
    for idx in range(articles.shape[0]):
        candidate = articles.headline[idx]
        score = fuzz.ratio(headline, candidate)
        if score > top_score:
            top_score = score
            top_headline = candidate
            top_article_id = articles.total_article_number[idx]
    return top_score, top_headline, top_article_id
Beispiel #5
0
async def get_nbnhhsh(keyword: str) -> Tuple[str, str]:
    """Look up *keyword* on the nbnhhsh abbreviation-guessing API.

    Returns (title, explanations). Both are empty when the API gave no
    translations or the best name isn't a close fuzzy match to the keyword.
    """
    url = "https://lab.magiconch.com/api/nbnhhsh/guess"
    headers = {"referer": "https://lab.magiconch.com/nbnhhsh/"}
    data = {"text": keyword}
    async with httpx.AsyncClient() as client:
        resp = await client.post(url=url, headers=headers, data=data)
        res = resp.json()

    title = ""
    lines = []
    for entry in res:
        # Keep only entries that actually carry non-empty translations;
        # the last such entry's name becomes the title.
        if "trans" in entry and entry["trans"]:
            title = entry["name"]
            lines.append(f"{entry['name']} => {','.join(entry['trans'])}")
    result = "\n".join(lines)

    # Reject results whose name doesn't closely match what was asked for.
    if fuzz.ratio(title.lower(), keyword.lower()) < 90:
        return "", ""
    return title, result
Beispiel #6
0
    def dp_search(
        self,
        query: str,
        nlp: Any,
        entity_type: str = "",
        entity_patterns: List[str] = None,
        match_dict: Dict[Any, Any] = None,
    ) -> Tuple[Text, Label, Value, Span, Score]:
        """Extract a multi-word entity span from *query* and fuzzy-match it.

        Args:
            query: Raw text to search.
            nlp: Pipeline where ``nlp(query).sentences[0].words`` yields
                tokens with ``upos``, ``start_char``, ``end_char``, ``text``
                (stanza-style — TODO confirm against caller).
            entity_type: Label to return alongside the match.
            entity_patterns: Candidate surface forms (defaults to [""]).
            match_dict: Maps each pattern to its canonical value
                (defaults to {}).

        Returns:
            (value, entity_type, matched_value, (span_start, span_end),
            score); matched_value/span/score are ""/(0, 0)/0.0 when nothing
            clears ``self.fuzzy_threshold``.
        """
        # Create the defaults per call: the previous mutable defaults
        # ([""] and {}) were shared between all invocations.
        if entity_patterns is None:
            entity_patterns = [""]
        if match_dict is None:
            match_dict = {}

        sentence = nlp(query).sentences[0]
        value = ""
        pos_tags = ["PROPN", "NOUN", "ADP"]
        result_dict = {}
        for word in sentence.words:
            if word.upos in pos_tags:
                # Remember where the first kept token starts; keep extending
                # the end as more tokens are appended.
                if value == "":
                    span_start = word.start_char
                span_end = word.end_char
                # Join individual tokens that together form the real entity,
                # since we may be dealing with multi-word entities here.
                value = value + str(word.text) + " "
        if value != "":
            for pattern in entity_patterns:
                val = fuzz.ratio(pattern, value) / 100
                if val > self.fuzzy_threshold:
                    match_value = match_dict[pattern]
                    result_dict[match_value] = val
            if result_dict:
                # Canonical value with the best score wins.
                match_output = max(result_dict, key=result_dict.get)
                match_score = result_dict[match_output]

                return (
                    value,
                    entity_type,
                    match_output,
                    (span_start, span_end),
                    match_score,
                )
        return (value, entity_type, "", (0, 0), 0.0)
Beispiel #7
0
async def get_baidu(keyword: str) -> Tuple[str, str]:
    """Look up *keyword* on Baidu Baike and return (title, message).

    Returns ("", "") when there is no content, the page doesn't parse,
    or the found title isn't a close fuzzy match for the keyword.
    """
    content = getBaike(keyword)
    if not content:
        return "", ""

    # Group layout: 1 = title (first line), 2 = optional subtitle
    # (group 3 is nested *inside* group 2), 4 = body after the newline.
    match_obj = re.match(r"(.*?)((.*?)?)\n(.*)", content)
    if not match_obj:
        return "", ""

    title = match_obj.group(1)
    subtitle = match_obj.group(2)
    # Fix: group(3) is the inner subtitle group, which duplicated the
    # subtitle and silently dropped the body; the body text is group(4).
    text = match_obj.group(4)
    if fuzz.ratio(title.lower(), keyword.lower()) < 90:
        return "", ""

    msg = title
    if subtitle:
        msg += subtitle
    msg += ":\n---------------\n" + text
    return title, msg
Beispiel #8
0
async def get_jiki(keyword: str) -> Tuple[str, Union[str, Message]]:
    """Search jikipedia.com for *keyword* and return (title, message).

    Returns ("", "") when the term is unknown, the search page has no
    result cards, or the card title isn't a close match for the keyword.
    """
    keyword = quote(keyword)
    search_url = "https://jikipedia.com/search?phrase={}".format(keyword)
    async with httpx.AsyncClient() as client:
        resp = await client.get(url=search_url)
        search_page = resp.text

    # The site embeds this marker when the term has no entry yet.
    if "对不起!小鸡词典暂未收录该词条" in search_page:
        return "", ""

    search_dom = etree.HTML(search_page, etree.HTMLParser())
    card_urls = search_dom.xpath(
        "//div[contains(@class, 'masonry')]/div/div/div/a[contains(@class, 'title-container')]/@href"
    )
    if not card_urls:
        return "", ""

    # Only the first result card is followed.
    async with httpx.AsyncClient() as client:
        resp = await client.get(url=card_urls[0])
        card_page = resp.text

    card_dom = etree.HTML(card_page, etree.HTMLParser())
    title = card_dom.xpath(
        "//div[@class='section card-middle']/div[@class='title-container']/div/h1/text()"
    )[0]
    content = card_dom.xpath(
        "//div[@class='section card-middle']/div[@class='content']/div")[0]
    content = content.xpath("string(.)").strip()
    img_urls = card_dom.xpath(
        "//div[@class='section card-middle']/div/div/div[@class='show-images']/img/@src"
    )
    # NOTE(review): keyword is URL-quoted at this point, so non-ASCII
    # keywords compare against their percent-encoded form — confirm intended.
    if fuzz.ratio(str(title).lower(), keyword.lower()) < 90:
        return "", ""

    msg = Message()
    msg.append(title + ":\n---------------\n")
    msg.append(content)
    for img_url in img_urls:
        msg.append(MessageSegment.image(file=img_url))
    return title, msg
Beispiel #9
0
async def get_content(
        keyword: str,
        sources=("jiki", "baidu", "nbnhhsh")) -> Union[str, Message]:
    """Query each of *sources* for *keyword* and return the single best hit.

    Each source callable returns (title, message); failures are logged and
    skipped. With several hits, the one whose title fuzzy-matches the
    keyword best wins. Returns "" when no source produced anything.
    """
    # Tuple default instead of a list: avoids the shared-mutable-default
    # pitfall (the sequence is only iterated, so callers are unaffected).
    msgs: List[Tuple[str, Union[str, Message]]] = []
    for s in sources:
        try:
            title, msg = await sources_func[s](keyword)
            if title and msg:
                msgs.append((title, msg))
        except Exception as e:
            logger.warning(f"Error in get_content({keyword}) using {s}: {e}")

    if not msgs:
        return ""
    # max() is O(n) and, like the previous stable reverse sort, picks the
    # first entry among ties.
    best = max(msgs, key=lambda m: fuzz.ratio(m[0].lower(), keyword.lower()))
    return best[1]
Beispiel #10
0
    async def get_deviantart_posts(self, msg: discord.Message, urls: list[str]):
        """Automatically fetch multiple posts from deviantart.

        Compares the URL slugs of all posts against the first one; if the
        average similarity is below 90 (or the deviation types differ),
        the posts are flagged to be sent as individual embeds.
        """
        MAX_EMBEDS = 5  # NOTE(review): unused in the visible part of this method
        # Slug of the first URL, minus the trailing "-<post id>" segment.
        title_to_test_against = urls[0].split('/')[-1].rsplit('-', maxsplit=1)[0]
        similarity_ratio = 0
        for url in urls[1:]:
            title = url.split('/')[-1].rsplit('-', maxsplit=1)[0]
            similarity_ratio += fuzz.ratio(title, title_to_test_against)
            print(f"{title}: {title_to_test_against} ({fuzz.ratio(title, title_to_test_against)})")

        display_as_singles = False
        # Average slug similarity over the remaining urls.
        # NOTE(review): raises ZeroDivisionError when len(urls) == 1 —
        # confirm callers always pass at least two urls.
        similarity_ratio /= len(urls) - 1
        print(f"Url similarity ratio: {similarity_ratio}")
        if similarity_ratio < 90:
            print("Urls seem unrelated from each other. Sending each embed individually.")
            display_as_singles = True

        # Check what type the first post is and if subsequent posts are of different types,
        # send them in one batch, but using different embed groups
        base_type: str = None
        api_results = []
        for url in urls:
            # Bail out silently when any url has no recognizable post id.
            if not (post_id := self.get_id(url)):
                return

            search_url = self.bot.assets['deviantart']['search_url_extended'].format(post_id)
            err_msg = f"Error fetching DA post #{post_id}"
            api_result = (await net_core.http_request(search_url, json=True, err_msg=err_msg)).json

            deviation = api_result['deviation']
            deviation_type = deviation['type']

            # First post's type is the reference for all subsequent ones.
            if base_type is None:
                base_type = deviation_type

            if deviation_type != base_type:
                print("Deviation types differ. Sending each embed individually.")
                display_as_singles = True

            # NOTE(review): api_results is not consumed in the visible part
            # of this method — it appears to continue beyond this chunk.
            api_results.append(api_result)
Beispiel #11
0
    async def dnd_spell_autocomplete(self, ctx: commands.SlashContext,
                                     interaction: discord.Interaction[None]):
        """
        Fuzzy match what the user input and the list of all spells.
        """

        # Pull the raw text the user has typed so far.
        assert interaction.options is not None
        user_input = interaction.options[0].options[0].value
        assert user_input

        # Keep only spells containing the input (case-insensitively),
        # ordered by fuzzy similarity, best matches first.
        scored = sorted(
            ((spell, fuzz.ratio(spell, user_input))
             for spell in self.all_spells
             if user_input.casefold() in spell.casefold()),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # And give them the top results.
        await interaction.response.send_autocomplete([
            discord.ApplicationCommandOptionChoice(name=spell, value=spell)
            for spell, _ in scored[:25]
        ])
Beispiel #12
0
 def testCaseInsensitive(self):
     """Raw strings differing in case don't score 100; full_process'd ones do."""
     self.assertNotEqual(fuzz.ratio(self.s1, self.s2), 100)
     processed_a = utils.full_process(self.s1)
     processed_b = utils.full_process(self.s2)
     self.assertEqual(fuzz.ratio(processed_a, processed_b), 100)
Beispiel #13
0
 def compare_to_query(self, search_query: str) -> float:
     """Fuzzy-compare the first word of *search_query* (leading slashes
     stripped) against this entry's tag; 0.0 for an empty query."""
     parts = search_query.lstrip("/").split(maxsplit=1)
     if parts:
         return fuzz.ratio(self.tag, parts[0])
     # Fix: return a float, matching the annotated return type
     # (previously returned the int 0).
     return 0.0
Beispiel #14
0
 def compare_to_query(self, search_query: str) -> float:
     """Score this contribution's name against *search_query*."""
     # Here we just assume that everything before the first "/" is
     # "ptbcontrib" (modulo typos). That could be wrong, but then it's
     # the user's fault :)
     relevant_part = search_query.split("/", maxsplit=1)[-1]
     return fuzz.ratio(self.name, relevant_part)
Beispiel #15
0
def similar(a, b):
    """Return the fuzzy similarity ratio (0-100) between *a* and *b*."""
    return fuzz.ratio(a, b)
Beispiel #16
0
 def testEqual(self):
     """Identical string pairs always score a perfect 100."""
     for left, right in ((self.s1, self.s1a),
                         (self.s8, self.s8a),
                         (self.s9, self.s9a)):
         self.assertEqual(fuzz.ratio(left, right), 100)
Beispiel #17
0
 def testRatioUnicodeString(self):
     """A lone non-ASCII char shares nothing with "ABCD", so the ratio is 0."""
     score = fuzz.ratio("\u00C1", "ABCD")
     self.assertEqual(0, score)
Beispiel #18
0
 def testEmptyStringsScore100(self):
     """Two empty strings are considered identical by both ratio variants."""
     empty = ""
     self.assertEqual(fuzz.ratio(empty, empty), 100)
     self.assertEqual(fuzz.partial_ratio(empty, empty), 100)
Beispiel #19
0
# -*- coding:utf-8 -*-
"""
参考 https://github.com/seatgeek/thefuzz
"""
__author__ = "aaron.qiu"

from pprint import pprint
from thefuzz import fuzz
from thefuzz import process

if __name__ == '__main__':
    # Plain ratio vs. partial (substring) ratio.
    pprint(fuzz.ratio("this is a test", "this is a test!"))
    pprint(fuzz.partial_ratio("this is a test", "this is a test!"))
    # Word order matters for the plain ratio ...
    pprint(fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))
    # ... but not for the token_sort variant.
    pprint(
        fuzz.token_sort_ratio("fuzzy wuzzy was a bear",
                              "wuzzy fuzzy was a bear"))
    # Duplicate tokens: token_sort keeps them, token_set deduplicates.
    pprint(fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
    pprint(fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
    choices = [
        "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"
    ]
    pprint(process.extract("new york jets", choices, limit=2))
    pprint(process.extractOne("cowboys", choices))
    # NOTE(review): a plain string is iterated character by character by
    # process.extractOne — presumably a stand-in for a list of song paths.
    songs = "/data/soft"
    pprint(process.extractOne("System of a down - Hypnotize - apache", songs))
    # Fix: this last demo's result was computed and then discarded; print it
    # like all the others.
    pprint(
        process.extractOne("System of a down - Hypnotize - Heroin",
                           songs,
                           scorer=fuzz.token_sort_ratio))