Example #1
0
def match_license(name: str) -> dict:
    """Resolve a free-form license name to the closest license known to
    spdx.org.

    The name is stripped and every whitespace-preceded "license" word
    (case-insensitive, together with the surrounding whitespace) is removed
    before matching. Unless the first pass scores exactly 100, the candidates
    that do not end in "-only" are re-scored with ``token_set_ratio`` and a
    tie at the top is broken with ``token_sort_ratio``.

    :param name: License name
    :return: Information of the license matched
    """
    all_licenses = get_all_licenses_from_spdx()
    name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE)

    best_matches = process.extract(name, _get_all_license_choice(all_licenses))
    spdx_license = best_matches[0]

    needs_refinement = spdx_license[1] != 100
    if needs_refinement:
        # Keep only the names, dropping the "-only" variants.
        best_matches = [
            candidate[0]
            for candidate in best_matches
            if not candidate[0].endswith("-only")
        ]

    if needs_refinement and best_matches:
        best_matches = process.extract(
            name, best_matches, scorer=token_set_ratio
        )
        spdx_license = best_matches[0]
        top_score = spdx_license[1]
        best_matches = [
            candidate[0]
            for candidate in best_matches
            if candidate[1] >= top_score
        ]
        if len(best_matches) > 1:
            # Several candidates share the top score: let a different
            # scorer pick one of them.
            spdx_license = process.extractOne(
                name, best_matches, scorer=token_sort_ratio
            )

    log.info(f"Best match for license {name} was {spdx_license}.\n"
             f"Best matches: {best_matches}")

    return _get_license(spdx_license[0], all_licenses)
Example #2
0
    async def convert(self, ctx: commands.Context, argument: str) -> List[discord.Member]:
        """Resolve ``argument`` to a list of members.

        An ID or mention is looked up directly. Anything else is, inside a
        guild, fuzzy matched (score cutoff 75) against member names first and
        then against the nicknames of members not already matched by name.

        Raises:
            BadArgument: if no member was found.
        """
        bot = ctx.bot
        guild = ctx.guild
        match = self._get_id_match(argument) or re.match(r"<@!?([0-9]+)>$", argument)
        result = []

        if match is not None:
            user_id = int(match.group(1))
            if guild:
                member = guild.get_member(user_id)
            else:
                member = _get_from_guilds(bot, "get_member", user_id)
            result.append(member)
        elif guild:
            # Not a mention
            by_name = {member: unidecode(member.name) for member in guild.members}
            result.extend(
                hit[2]
                for hit in process.extract(argument, by_name, limit=None, score_cutoff=75)
            )
            # Nicknames are only considered for members the name pass missed.
            by_nick = {
                member: unidecode(member.nick)
                for member in guild.members
                if member.nick and member not in result
            }
            result.extend(
                hit[2]
                for hit in process.extract(argument, by_nick, limit=None, score_cutoff=75)
            )

        if not result or result == [None]:
            raise BadArgument('Member "{}" not found'.format(argument))

        return result
Example #3
0
    def search_audiobooks(self,
                          since=None,
                          author=None,
                          title=None,
                          tag=None,
                          limit=25):
        """
        Lazily yield audiobooks fuzzy-matched by title first, then by author.

        Args:
            since: a UNIX timestamp; accepted but not used by this implementation
            author: author name, matched against each book's space-joined authors
            title: title query, matched against the audiobooks themselves
            tag: accepted but not used by this implementation
            limit: max entries to yield per query (int)

        Yields:
            AudioBook objects; a book yielded for the title query is not
            yielded again for the author query.
        """
        # Work on a copy so that dropping already-yielded books never mutates
        # the list handed out by get_all_audiobooks().
        remaining = list(self.get_all_audiobooks())

        # priority for title matches
        if title:
            for res in process.extract(title, remaining, limit=limit):
                # res[0] works for both (match, score) and
                # (match, score, index) result tuples.
                match = res[0]
                yield match
                remaining.remove(match)

        # second author matches
        if author:
            choices = [" ".join(str(a) for a in b.authors) for b in remaining]
            for res in process.extract(author, choices, limit=limit):
                idx = choices.index(res[0])
                match = remaining[idx]
                yield match
                # Remove from BOTH lists: they are parallel, so deleting from
                # only one would shift every later lookup onto the wrong book.
                del remaining[idx]
                del choices[idx]
Example #4
0
def fuzzy_match(query, mapped_choices, limit=10, score_cutoff=88):
    """Fuzzy-match ``query`` and return the hits as dictionaries.

    Each hit is a dict with the keys "match", "score" and "result". When no
    hit reaches ``score_cutoff`` the search is repeated without a cutoff and
    those hits are returned instead.
    """
    def _as_records(hits):
        return [
            {"match": match, "score": score, "result": result}
            for match, score, result in hits
        ]

    strict_hits = _as_records(
        process.extract(query, mapped_choices, limit=limit, score_cutoff=score_cutoff)
    )
    if strict_hits:
        return strict_hits

    return _as_records(process.extract(query, mapped_choices, limit=limit))
Example #5
0
    async def convert(self, ctx: commands.Context, argument: str) -> List[discord.Guild]:
        """Resolve ``argument`` to a list of guilds the bot is in.

        An ID is looked up directly; anything else is fuzzy matched (score
        cutoff 75) against the guild names.

        Raises:
            BadArgument: if the author is not the bot owner, or if no guild
                was found.
        """
        bot = ctx.bot
        match = self._get_id_match(argument)

        if not await bot.is_owner(ctx.author):
            # Don't need to be snooping other guilds unless we're
            # the bot owner
            raise BadArgument(_("That option is only available for the bot owner."))

        if match:
            guild = bot.get_guild(int(match.group(1)))
            if not guild:
                raise BadArgument('Guild "{}" not found'.format(argument))
            return [guild]

        # Not a mention
        names_by_guild = {guild: unidecode(guild.name) for guild in bot.guilds}
        result = [
            hit[2]
            for hit in process.extract(argument, names_by_guild, limit=None, score_cutoff=75)
        ]
        if not result:
            raise BadArgument('Guild "{}" not found'.format(argument))

        return result
Example #6
0
    async def get_fuzzy_member(self, ctx, name):
        """Find the guild member that best matches ``name``.

        Lookup order:
          1. a member whose name equals ``name`` exactly;
          2. a member whose stored name (member config) equals ``name``;
          3. the best fuzzy match (score cutoff 75) on the decoded names.

        Hits from steps 1 and 3 are remembered by writing ``name`` to the
        member's ``storedname`` config value.

        Returns:
            The matching member, or None when nothing matches.
        """
        user = discord.utils.get(ctx.guild.members, name=name)
        if user:
            await self.config.member(user).storedname.set(name)
            return user

        all_users = await self.config.all_members(ctx.guild)
        for user_id, data in all_users.items():
            member = ctx.guild.get_member(int(user_id))
            # Skip stored ids that no longer resolve to a guild member.
            if member and data["storedname"] == name:
                return member

        hits = process.extract(
            name,
            {m: self.decode_cancer_name(m.name) for m in ctx.guild.members},
            limit=None,
            score_cutoff=75,
        )
        if not hits:
            return None

        # Each hit is (decoded name, score, member). Pick the highest score;
        # max() keeps the earliest hit on ties, which is also what the old
        # sort-inside-the-loop code ended up returning.
        best_member = max(hits, key=lambda hit: hit[1])[2]
        await self.config.member(best_member).storedname.set(name)
        return best_member
Example #7
0
    def search_docs(self, query, k=8, make_default=False):
        """Return up to ``k`` doc names ranked by similarity to ``query``.

        The query is stripped and lower-cased. For an empty query the result
        is ``k`` random doc names when ``make_default`` is true, else None.
        Otherwise names are scored with ``fuzz.ratio``; a name equal to the
        query gets +50 and a name containing the query gets +20 (an exact
        match therefore gets both) before the final ordering.
        """
        query = query.strip().lower()

        if not query:
            if make_default:
                return choices(self._docs_names, k=k)
            return None

        # further fuzzy search it using rapidfuzz ratio matching
        fuzzed = process.extract(
            query=query,
            choices=self._docs_names,
            scorer=fuzz.ratio,
            processor=None,
            limit=max(k, 8),
        )

        def boosted(name, score):
            lowered = name.lower()
            if lowered == query:
                score += 50
            if query in lowered:
                score += 20
            return score

        tweak = [(name, boosted(name, score)) for name, score, _idx in fuzzed]
        tweak.sort(key=lambda v: v[1], reverse=True)

        return [name for name, _score in tweak][:k]
Example #8
0
    async def convert(self, ctx: commands.Context,
                      argument: str) -> discord.Role:
        """Resolve ``argument`` to a role.

        The parent converter is tried first. If it raises BadArgument, the
        guild's role names are fuzzy matched (score cutoff 75) and the
        highest-scoring role is returned.

        Raises:
            BadArgument: if no role reaches the cutoff; the message is only
                filled in when ``self.response`` is truthy.
        """
        try:
            return await super().convert(ctx, argument)
        except BadArgument:
            # Fall through to the fuzzy lookup below.
            pass

        names_by_role = {role: unidecode(role.name) for role in ctx.guild.roles}
        result = [
            (hit[2], hit[1])
            for hit in process.extract(
                argument, names_by_role, limit=None, score_cutoff=75)
        ]

        if not result:
            raise BadArgument(
                f'Role "{argument}" not found.' if self.response else None)

        best_role, _score = max(result, key=lambda pair: pair[1])
        return best_role
Example #9
0
def find_matching_site(site_name: str, possible_sites: Dict) -> str:
    """Try to find names similar to site_name among the keys of possible_sites
    and return a suggestion or error string.

    Args:
        site_name: Name of site
        possible_sites: Mapping of site name to site code
    Returns:
        str: Suggestion / error message
    """
    no_suggestion = f"No suggestion for {site_name}."

    # Nothing to compare against: answer before importing or indexing.
    if not possible_sites:
        return no_suggestion

    from rapidfuzz import process

    matches = process.extract(site_name, possible_sites.keys())
    if not matches:
        return no_suggestion

    # This seems like a decent cutoff score for a decent find
    cutoff_score = 85

    best_match, best_score = matches[0][0], matches[0][1]
    if best_score < cutoff_score:
        return no_suggestion

    # A lone candidate, or a candidate that beats the runner-up, is a clear
    # winner. A score exactly equal to the cutoff counts as good enough.
    is_clear_winner = len(matches) == 1 or best_score > matches[1][1]
    if is_clear_winner:
        return f"Did you mean {best_match.title()}, code: {possible_sites[best_match]} ?"

    # The top score is shared: list every candidate that was returned.
    suggestions = [
        f"{match[0].title()}, code: {possible_sites[match[0]]}"
        for match in matches
    ]
    nl_char = "\n"
    return f"Did you mean one of : \n {nl_char.join(suggestions)}"
Example #10
0
async def search_for_players(name: str):
    """Fuzzy-match ``name`` against the full names stored in NHLPlayers.

    All (player_id, full_name) rows are loaded into a dict and handed to
    ``process.extract``; its result is returned unchanged.
    """
    rows = await NHLPlayers.select('player_id', 'full_name').gino.all()
    players = dict(rows)
    return process.extract(name, players)
Example #11
0
    def find_fitting_model(self,
                           search_term: str,
                           limit: int = 5) -> List[str]:
        """
        Suggest model names for a search term (e.g.: Sensor).

        The case-folded search term is fuzzy matched against the keys of the
        class catalogue with a score cutoff of 50. A suggested name can then
        be used to retrieve the model with the method
        "get_class_by_name(selectedName)".

        Args:
            search_term (str): search term to find a model by name
            limit (int): Max Number of suggested results (default: 5)

        Returns:
            List[str], containing 0 to [limit] ordered propositions (best first)
        """
        matches = process.extract(query=search_term.casefold(),
                                  choices=list(self.class_catalogue.keys()),
                                  score_cutoff=50,
                                  limit=limit)
        return [name for name, *_rest in matches]
Example #12
0
def match_authors(name, allcontrib_names, allcontrib_login):
    """
    Use fuzzy string matching to match names of committers
    to the names or handles mentioned in the allcontributors file.

    Args:
        name: str; name of committer
        allcontrib_names: list; names in allcontributorsrc file
        allcontrib_login: list; logins in allcontributorsrc file

    Returns:
        tuple; ``([name, matched], None)`` when a name or login matched,
        where ``matched`` is the matching entry itself, otherwise
        ``(None, name)``.
    """
    # First, match the name. If no match, try Github login.
    # extractOne is used for both so that matching[0] is always the matched
    # string (process.extract would return a list of result tuples instead).
    for candidates in (allcontrib_names, allcontrib_login):
        matching = process.extractOne(name,
                                      candidates,
                                      scorer=fuzz.token_sort_ratio,
                                      score_cutoff=71)
        if matching:
            return [name, matching[0]], None

    return None, name
Example #13
0
    def has_wake_word(self, phrase):
        """Tell whether ``phrase`` starts with the wake word.

        For phrases of two or more words, a first word scoring above 80
        against "ok" or "hey" is treated as a prefix and the second word is
        tested instead. The tested word matches when its ``fuzz.ratio``
        against ``self.wake_word`` (both lower-cased) is at least 80.

        Side effects: ``self.heard`` is reset to "" for a one-word phrase and,
        on a match, set to the words from the computed start index onwards
        (index 0 for a one-word phrase).

        Returns:
            True on a match, False otherwise.
        """
        phrase_parts = phrase.split()
        word_count = len(phrase_parts)

        test_word = None
        start_index = 0

        if word_count == 1:
            test_word = phrase_parts[0]
            self.heard = ""
        elif word_count > 1:
            first_word, second_word = phrase_parts[0:2]
            prefix_hits = process.extract(first_word, ["ok", "hey"])
            if any(hit[1] > 80 for hit in prefix_hits):
                test_word, start_index = second_word, 2
            else:
                test_word, start_index = first_word, 1

        retn = False
        if test_word:
            retn = fuzz.ratio(test_word.lower(), self.wake_word.lower()) >= 80

        if retn:
            self.heard = " ".join(phrase_parts[start_index:])

        return retn
Example #14
0
    def _get_page(self, topic, request_options=None):
        topics_list = self.get_topics_list()
        if topic.startswith(':'):
            topics_list = [x for x in topics_list if x.startswith(':')]
        else:
            topics_list = [x for x in topics_list if not x.startswith(':')]

        if _USING_FUZZYWUZZY:
            possible_topics = process.extract(topic, topics_list, scorer=fuzz.ratio)[:3]
        else:
            possible_topics = process.extract(topic, topics_list, limit=3, scorer=fuzz.ratio)
        possible_topics_text = "\n".join([("    * %s %s" % x) for x in possible_topics])
        return """
Unknown topic.
Do you mean one of these topics maybe?

%s
    """ % possible_topics_text
Example #15
0
    async def command_not_found(self, string: str) -> "HelpQueryNotFound":
        """
        Build the error for a query that matches no valid command, group, cog or category.

        Returns a `HelpQueryNotFound` instance carrying the error message and a
        mapping of possible matches (score cutoff 60) to their scores.
        """
        candidates = list(await self.get_all_help_choices())
        hits = process.extract(
            default_process(string),
            candidates,
            scorer=fuzz.ratio,
            score_cutoff=60,
            processor=None,
        )
        possible_matches = {hit[0]: hit[1] for hit in hits}
        return HelpQueryNotFound(f'Query "{string}" not found.', possible_matches)
Example #16
0
 def testIssue81(self):
     """Extracting from a pandas Series keeps the Series index as the key.

     This mostly tests whether this segfaults due to incorrect ref counting.
     """
     labels = ['test color brightness', 'test lemon', 'test lavender']
     keys = [67478, 67479, 67480]
     choices = pd.Series(labels, index=keys)

     expected = [(label, 90.0, key) for label, key in zip(labels, keys)]
     matches = process.extract("test", choices)
     assert matches == expected
Example #17
0
    async def extract_from_list(self,
                                ctx,
                                argument,
                                list_of_items,
                                processors,
                                unsure=False):
        """Uses multiple scorers and processors for a good mix of accuracy and fuzzy-ness

        Every (scorer, processor) combination is tried in turn with a score
        cutoff of 80 and a limit of 5. A hit is kept only when it is new and
        the lower-cased argument is contained in the processed entry. As soon
        as more than one candidate has accumulated, the first five are handed
        to ``self.selection_handler``.

        Returns:
            None when nothing matched, the result of ``selection_handler``
            when several candidates were found, otherwise the single matched
            entry (after calling ``unsure_select_handler`` if ``unsure`` is
            set and its score is below 95).
        """
        combined_list = []

        scorers = (fuzz.token_set_ratio, fuzz.WRatio)

        for scorer in scorers:
            # WRatio isn't the best, so we add in extra filters to make sure
            # everything turns out ok. The check is on the *scorer*; it used
            # to compare the processor with fuzz.WRatio and so never applied.
            extra_filter = scorer == fuzz.WRatio

            for processor in processors:
                fuzzy_list = process.extract(
                    argument,
                    list_of_items,
                    processor=processor,
                    scorer=scorer,
                    score_cutoff=80,
                    limit=5,
                )
                if not fuzzy_list:
                    continue

                combined_entries = [e[0] for e in combined_list]
                needle = argument.lower()

                new_members = []
                for entry in fuzzy_list:
                    if entry[0] in combined_entries:
                        continue
                    processed = processor(entry[0])
                    # Extra filter: drop near-empty processed entries unless
                    # the argument itself is that short.
                    if extra_filter and len(processed) < 2 and len(argument) > 2:
                        continue
                    if needle in processed:
                        new_members.append(entry)

                combined_list.extend(new_members)

                if len(combined_list) > 1:
                    return await self.selection_handler(ctx, combined_list[:5])

        if not combined_list:
            return None

        # The loop returns as soon as there are two or more candidates, so
        # exactly one is left at this point.
        entry, score = combined_list[0][0], combined_list[0][1]
        if unsure and score < 95:  # entries score
            await self.unsure_select_handler(ctx, entry)
        return entry  # actual entry itself
Example #18
0
 def search(query: str, result_size: int = 5) -> List[NoteInDB]:
     """Return the notes whose content best matches ``query``.

     Note contents are keyed by ``doc_id`` and scored with
     ``fuzz.token_set_ratio`` (score cutoff 20); at most ``result_size``
     notes are looked up by id and returned, in the order of the hits.
     """
     content_by_id = {note.doc_id: note.content for note in Notes.all()}
     hits = process.extract(query,
                            choices=content_by_id,
                            scorer=fuzz.token_set_ratio,
                            limit=result_size,
                            score_cutoff=20)
     found = []
     for hit in hits:
         # hit[2] is the dict key of the matched content, i.e. the doc_id.
         found.append(Notes.get_by_id(hit[2]))
     return found
Example #19
0
 def rank(self, target, searches, limit=10):
     """Rank the keys of ``searches`` by similarity to ``target``.

     Each hit's ``fuzz.partial_ratio`` score is multiplied by
     ``log(searches[key] + 1)``; hits with a score of 0 are dropped.

     Returns:
         The matched keys, highest weighted score first, or ``[target]``
         when there is no hit with a positive score.
     """
     hits = process.extract(target,
                            searches.keys(),
                            limit=limit,
                            scorer=fuzz.partial_ratio)
     weighted = []
     for hit in hits:
         term, score = hit[0], hit[1]
         if score > 0:
             weighted.append((term, score * math.log(searches[term] + 1)))

     if not weighted:
         return [target]

     weighted.sort(key=lambda pair: -pair[1])
     return [term for term, _weight in weighted]
Example #20
0
    def _handle_not_found(self, query: str) -> None:
        """
        Handles when a query does not match a valid command or cog.

        Always raises `HelpQueryNotFound`, passing along the possible close
        matches (score cutoff 90) as a mapping of name to score.
        """
        # Combine command and cog names
        choices = list(self._bot.all_commands) + list(self._bot.cogs)

        result = process.extract(query, choices, score_cutoff=90)

        # Build the mapping from the first two fields only: dict(result)
        # raises ValueError when the results are (match, score, index) triples.
        possible_matches = {match[0]: match[1] for match in result}

        raise HelpQueryNotFound(f'Query "{query}" not found.', possible_matches)
def fuzzy_find(matches, urls):
    """Return the url(s) with the highest fuzzy score summed over ``matches``.

    Every entry of ``matches`` is scored against all ``urls`` and the scores
    are added up per url. The result is a list holding every url that shares
    the top total, or an empty list when nothing was scored.
    """
    totals = {}
    for needle in matches:
        for url, score in process.extract(needle, urls, limit=None):
            totals[url] = totals.get(url, 0) + score

    if not totals:
        return []

    ranked = sorted(((score, url) for url, score in totals.items()), reverse=True)
    top_score = ranked[0][0]
    leaders = takewhile(lambda pair: pair[0] == top_score, ranked)
    return [url for _score, url in leaders]
Example #22
0
    def check_consistency(cls, values):
        """
        Validate and auto complete unit data based on the loaded units table.

        The unit is looked up by code and name together, by code alone, or by
        name alone, depending on which of them is given. The matching row then
        fills in "code", "name", "symbol", "conversion_factor" and, if it is
        missing, "description".

        Args:
            values (dict): Values of all data fields

        Returns:
            values (dict): Validated data

        Raises:
            ValueError: if no unit matches; for a name-only lookup the
                message lists fuzzy-matched suggestions.
            AssertionError: if neither 'name' nor 'code' is provided.
        """
        units = load_units()
        code = values.get("code")
        name = values.get("name")

        # Unwrap the model types to their plain values.
        if isinstance(code, UnitCode):
            code = code.value
        if isinstance(name, UnitText):
            name = name.value

        if not code and not name:
            raise AssertionError("'name' or 'code' must be  provided!")

        if code and name:
            both_match = (units.CommonCode == code) & (units.Name == name)
            idx = units.index[both_match]
            if idx.empty:
                raise ValueError("Invalid combination of 'code' and 'name': ",
                                 code, name)
        elif code:
            idx = units.index[(units.CommonCode == code)]
            if idx.empty:
                raise ValueError("Invalid 'code': ", code)
        else:
            idx = units.index[(units.Name == name)]
            if idx.empty:
                matches = process.extract(
                    query=name.casefold(),
                    choices=units.Name.tolist(),
                    score_cutoff=50,
                    limit=5)
                suggestions = [item[0] for item in matches]

                raise ValueError(f"Invalid 'name' for unit! '{name}' \n "
                                 f"Did you mean one of the following? \n "
                                 f"{suggestions}")

        # The first matching row provides the completed data.
        row = idx[0]
        values["code"] = UnitCode(value=units.CommonCode[row]).value
        values["name"] = UnitText(value=units.Name[row]).value
        values["symbol"] = units.Symbol[row]
        values["conversion_factor"] = units.ConversionFactor[row]
        if not values.get("description"):
            values["description"] = units.Description[row]
        return values
Example #23
0
    def testWithScorer(self):
        """The best match depends on the scorer, for list and dict choices alike."""
        choices = [
            "new york mets vs chicago cubs",
            "chicago cubs at new york mets",
            "atlanta braves vs pittsbugh pirates",
            "new york yankees vs boston red sox",
        ]
        # Same strings, keyed 1..4.
        choices_mapping = dict(enumerate(choices, start=1))

        # in this hypothetical example we care about ordering, so we use quick ratio
        query = "new york mets at chicago cubs"

        # first, as an example, the normal way would select the "more 'complete' match of choices[1]"
        default_cases = (
            (choices, choices[1]),
            (choices_mapping, choices_mapping[2]),
        )
        for candidates, expected in default_cases:
            best = process.extractOne(query, candidates)
            self.assertEqual(best[0], expected)
            best = process.extract(query, candidates)[0]
            self.assertEqual(best[0], expected)

        # now, use the custom scorer
        qratio_cases = (
            (choices, choices[0]),
            (choices_mapping, choices_mapping[1]),
        )
        for candidates, expected in qratio_cases:
            best = process.extractOne(query, candidates, scorer=fuzz.QRatio)
            self.assertEqual(best[0], expected)
            best = process.extract(query, candidates, scorer=fuzz.QRatio)[0]
            self.assertEqual(best[0], expected)
Example #24
0
 def ner2slot(self, input_entity, slot):
     """Given a named entity, return its normalized slot value and the score.

     A list entity is joined with spaces first. The entity is then fuzzy
     matched against every known variant of ``slot`` and the normalized value
     owning the best-matching variant is returned with the match score.
     """
     if isinstance(input_entity, list):
         input_entity = ' '.join(input_entity)

     slot_vals = self._slot_vals[slot]
     # (variant, normalized value) pairs, in catalogue order
     pairs = [
         (variant, normalized)
         for normalized in slot_vals
         for variant in slot_vals[normalized]
     ]
     entities = [variant for variant, _normalized in pairs]

     best_match, score = process.extract(input_entity,
                                         entities,
                                         limit=2**20)[0]
     return pairs[entities.index(best_match)][1], score
Example #25
0
def get_fuzzy_accounts(identifier, accounts):
    """Suggest account names or aliases that resemble ``identifier``.

    :param identifier: identifier to fuzz
    :param accounts: accounts to compare; each provides "name" and "aliases"
    :return: the up to three best matches as returned by ``process.extract``
    """
    # collect all possibilities: each account's name followed by its aliases
    choices = [
        label
        for account in accounts
        for label in (account["name"], *account["aliases"])
    ]

    return process.extract(identifier, choices, limit=3)
Example #26
0
    def validate_text(cls, value):
        """Accept ``value`` if it equals a known unit name, ignoring case.

        Returns:
            ``value`` unchanged when a unit name matches.

        Raises:
            ValueError: otherwise, listing up to five fuzzy-matched
                suggestions (score cutoff 50).
        """
        units = load_units()
        wanted = value.casefold()

        matching_rows = units.loc[units.Name.str.casefold() == wanted]
        if len(matching_rows) > 0:
            return value

        matches = process.extract(
            query=wanted,
            choices=units.Name.tolist(),
            score_cutoff=50,
            limit=5)
        suggestions = [item[0] for item in matches]
        raise ValueError(f"Invalid 'name' for unit! '{value}' \n "
                         f"Did you mean one of the following? \n "
                         f"{suggestions}")
Example #27
0
def part2(puzzle_input: list) -> str:
    """Find two entries differing in one position and return the shared rest.

    For every entry the second-best fuzzy match within ``puzzle_input`` is
    taken (the best one being the entry itself). If its score reaches the
    threshold for a one-character difference, the entry is returned with the
    first differing character removed. Nothing is returned when no pair
    qualifies.
    """
    for candidate in puzzle_input:
        # Get closest match, ignore self match
        runner_up = process.extract(candidate, puzzle_input, limit=2)[1]

        # Match should differ by 1 character, calculate this percentage and use as a threshold
        threshold = floor((len(candidate) - 1) / len(candidate) * 100)
        if runner_up[1] < threshold:
            continue

        other = runner_up[0]
        for pos in range(len(other)):
            if candidate[pos] != other[pos]:
                return candidate[:pos] + candidate[pos + 1:]
Example #28
0
    async def order(self, keywords, queue):
        """Queue the files matching a "td <terms...>" query.

        Does nothing unless the first keyword is "td" and further terms are
        given. The joined terms are fuzzy matched against
        ``self.for_compare``; each hit is put on ``queue`` as a ``TeXPdfFile``
        built from the file at the hit's index and its score plus 50.
        """
        kw = " ".join(keywords[1:]) if keywords[0] == "td" else ""
        if not kw:
            return
        from rapidfuzz import fuzz, process

        hits = process.extract(
            kw,
            self.for_compare,
            limit=self.num_candidates,
            scorer=fuzz.partial_token_sort_ratio,
            score_cutoff=self.score_cutoff,
        )
        for _text, score, idx in hits:
            queue.put(TeXPdfFile(self.files[idx], score + 50))
Example #29
0
    def get_fuzzy_role(self, ctx, name: str):
        """Return the guild role whose name best matches ``name``.

        Role names are fuzzy matched with a score cutoff of 75 and the
        highest-scoring role wins.

        Raises:
            BadArgument: if no role reaches the cutoff.
        """
        names_by_role = {role: unidecode(role.name) for role in ctx.guild.roles}
        hits = process.extract(
            name,
            names_by_role,
            limit=None,
            score_cutoff=75,
        )

        if not hits:
            # The placeholder used to be left unformatted in the message.
            raise BadArgument("{} is not a valid role".format(name))

        # Each hit is (role name, score, role); max() keeps the first hit on
        # ties, like the stable descending sort it replaces.
        return max(hits, key=lambda hit: hit[1])[2]
Example #30
0
    def Match(self, query: str):
        """Return the D-Bus tuples of the actions whose key matches ``query``.

        Action keys are scored with ``fuzz.token_sort_ratio`` (score cutoff
        25); every hit is resolved back to its action and converted with
        ``to_dbus_tuple(score)``.
        """
        logger.info(f"Matching {query}")

        action_keys = [action.key for action in self.runners_manager.actions()]
        results = process.extract(query, action_keys,
                                  scorer=fuzz.token_sort_ratio,
                                  score_cutoff=25)
        logger.info(f"Search results: {results}")

        return [
            self.runners_manager.get_action_by_key(result_key).to_dbus_tuple(score)
            for result_key, score, _index in results
        ]