Esempio n. 1
    def get_ids(self) -> Union[str, List[str], None]:
        """ Return the identifier(s) of the configured matching method

        Returns:
            * the method itself when it is a string
            * a list with the `model_id` of every element when the method is
              any other iterable
            * None when the method is neither a string nor an iterable
        """
        method = self.method

        # Strings are iterable as well, so they must be ruled out first
        if isinstance(method, str):
            return method

        if not isinstance(method, Iterable):
            return None

        return [entry.model_id for entry in method]
Esempio n. 2
    def get_cluster_mappings(self, name: str = None) -> Mapping[str, int]:
        """ Return the cluster mappings stored on this instance

        Arguments:
            name: key of the mapping to return; only consulted when more than
                  one set of matches is stored

        Returns:
            * the first stored mapping when there is exactly one set of matches
            * the mapping stored under `name` when there are several sets of
              matches and `name` is truthy
            * the complete dictionary of mappings in every other case
        """
        n_matches = len(self.matches)

        if n_matches == 1:
            # A single set of matches: `name` is ignored
            stored = list(self.cluster_mappings.values())
            return stored[0]

        if n_matches > 1 and name:
            return self.cluster_mappings[name]

        return self.cluster_mappings
Esempio n. 3
    def get_matches(self, model_id: str = None) -> Union[pd.DataFrame,
                                                           Mapping[str, pd.DataFrame]]:
        """ Return the matches of one model or of all models

        Arguments:
            model_id: key of the matches to return; only consulted when more
                      than one set of matches is stored

        Returns:
            * the only stored matches when there is exactly one set
            * the matches stored under `model_id` when there are several sets
              and `model_id` is truthy
            * the complete dictionary of matches in every other case
        """
        n_matches = len(self.matches)

        if n_matches == 1:
            # A single set of matches: `model_id` is ignored
            stored = list(self.matches.values())
            return stored[0]

        if n_matches > 1 and model_id:
            return self.matches[model_id]

        return self.matches
Esempio n. 4
    def get_clusters(self, model_id: str = None) -> Mapping[str, List[str]]:
        """ Get the groupings/clusters from a single model

        Arguments:
            model_id: the model id of the model if you have specified multiple models

        Returns:
            * the only stored clusters when there is exactly one set of matches
            * the clusters stored under `model_id` when there are several sets
              of matches and `model_id` is truthy
            * the complete dictionary of clusters in every other case
        """
        if len(self.matches) == 1:
            # A single set of matches: `model_id` is ignored
            return list(self.clusters.values())[0]

        elif len(self.matches) > 1 and model_id:
            return self.clusters[model_id]

        return self.clusters
Esempio n. 5
    def group(self,
              model: Union[str, BaseMatcher] = None,
              link_min_similarity: float = 0.75):
        """ From the matches, group the `To` matches together using single linkage

        Arguments:
             model: you can choose one of the models in `polyfuzz.models` to be used as a grouper.
                    Either one of the names 'TF-IDF', 'EditDistance' or 'Embeddings', an object
                    exposing `match(from_list, to_list)`, or None to automatically use TF-IDF
             link_min_similarity: the minimum similarity between strings before they are grouped
                                  in a single linkage fashion

        Updates:
            self.matches: Adds a column `Group` that is the grouped version of the `To` column
            self.clusters: reset, then filled with the clusters found per set of matches
            self.cluster_mappings: reset, then filled with the cluster id map per set of matches

        Raises:
            ValueError: if `model` is a string that is not one of the supported names
        """
        self.clusters = {}
        self.cluster_mappings = {}

        # Standard models - quick access
        if isinstance(model, str):
            # The grouper is selected by the `model` argument; `self.method`
            # describes the matcher and must not influence this choice
            if model in ["TF-IDF", "TFIDF"]:
                # NOTE(review): the trailing argument of this call was missing
                # from the file; `min_similarity` mirrors the Embeddings
                # branch below - confirm against the TFIDF constructor
                model = TFIDF(n_gram_range=(3, 3),
                              min_similarity=link_min_similarity)
            elif model in ["EditDistance", "Edit Distance"]:
                model = RapidFuzz()
            elif model in ["Embeddings", "Embedding"]:
                model = Embeddings(min_similarity=link_min_similarity)
            else:
                raise ValueError(
                    "Please instantiate the model with one of the following methods: \n"
                    "* 'TF-IDF'\n"
                    "* 'EditDistance'\n"
                    "* 'Embeddings'\n"
                    "* Or None if you want to automatically use TF-IDF")

        # Use TF-IDF if no model is specified
        elif not model:
            model = TFIDF(n_gram_range=(3, 3),
                          min_similarity=link_min_similarity)

        for name, match in self.matches.items():
            # Compare the unique, non-missing `To` strings with themselves
            strings = list(match.To.dropna().unique())
            matches = model.match(strings, strings)
            clusters, cluster_id_map, cluster_name_map = single_linkage(
                matches, link_min_similarity)
            self._map_groups(name, cluster_name_map)
            self.clusters[name] = clusters
            self.cluster_mappings[name] = cluster_id_map
Esempio n. 6
    def group(self,
              model: Union[str, BaseMatcher] = None,
              link_min_similarity: float = 0.75,
              group_all_strings: bool = False):
        """ From the matches, group the `To` matches together using single linkage

        Arguments:
             model: you can choose one of the models in `polyfuzz.models` to be used as a grouper.
                    Either one of the names 'TF-IDF', 'EditDistance' or 'Embeddings', a matcher
                    instance, or None to automatically use TF-IDF
             link_min_similarity: the minimum similarity between strings before they are grouped
                                  in a single linkage fashion
             group_all_strings: if you want to compare a list of strings with itself and then cluster
                                those strings, set this to True. Otherwise, only the strings that
                                were mapped To are clustered.

        Updates:
            self.matches: Adds a column `Group` that is the grouped version of the `To` column

        Raises:
            ValueError: if `model` is a string that is not one of the supported names
        """
        self.clusters = {}
        self.cluster_mappings = {}

        # Standard models - quick access
        if isinstance(model, str):
            # The grouper is selected by the `model` argument; `self.method`
            # describes the matcher and must not influence this choice
            if model in ["TF-IDF", "TFIDF"]:
                # NOTE(review): the trailing argument of this call was missing
                # from the file; `min_similarity` mirrors the Embeddings
                # branch below - confirm against the TFIDF constructor
                model = TFIDF(n_gram_range=(3, 3),
                              min_similarity=link_min_similarity)
            elif model in ["EditDistance", "Edit Distance"]:
                model = RapidFuzz()
            elif model in ["Embeddings", "Embedding"]:
                model = Embeddings(min_similarity=link_min_similarity)
            else:
                raise ValueError(
                    "Please instantiate the model with one of the following methods: \n"
                    "* 'TF-IDF'\n"
                    "* 'EditDistance'\n"
                    "* 'Embeddings'\n"
                    "* Or None if you want to automatically use TF-IDF")

        # Use TF-IDF if no model is specified
        elif not model:
            model = TFIDF(n_gram_range=(3, 3),
                          min_similarity=link_min_similarity)

        # Group per model
        for name in self.matches:
            # NOTE(review): the last argument of this call was missing from
            # the file; `group_all_strings` is the only parameter that was
            # otherwise unused - confirm against `_create_groups`
            self._create_groups(name, model, link_min_similarity,
                                group_all_strings)
Esempio n. 7
    def visualize_precision_recall(self,
                                   kde: bool = False,
                                   save_path: str = None):
        """ Calculate and visualize precision-recall curves

        A minimum similarity score might be used to identify
        when a match could be considered to be correct. For example,
        we can assume that if a similarity score passes 0.95 we are
        quite confident that the matches are correct. This minimum
        similarity score can be defined as **precision** since it shows
        you how precise we believe the matches are at a minimum.

        **Recall** can then be defined as the percentage of matches
        found at a certain minimum similarity score. A high recall means
        that for a certain minimum precision score, we find many matches.

        Arguments:
            kde: whether to also visualize the kde plot
            save_path: the path to save the resulting image to

        Updates:
            self.min_precisions, self.recalls, self.average_precisions:
                reset, then filled with one entry per set of matches

        Usage:

        ```python
        import polyfuzz as pf
        model = pf.PolyFuzz("TF-IDF", model_id="TF-IDF")
        model.match(from_list = ["string_one", "string_two"],
                    to_list = ["string_three", "string_four"])
        model.visualize_precision_recall(save_path="results.png")
        ```
        """
        self.min_precisions = {}
        self.recalls = {}
        self.average_precisions = {}

        for name, match in self.matches.items():
            # NOTE(review): the argument of this call was missing from the
            # file; `match` is the loop variable that was otherwise unused -
            # confirm against `precision_recall_curve`
            min_precision, recall, average_precision = precision_recall_curve(match)
            self.min_precisions[name] = min_precision
            self.recalls[name] = recall
            self.average_precisions[name] = average_precision

        # The bare name resolves to the module-level function, not to this method
        visualize_precision_recall(self.matches, self.min_precisions,
                                   self.recalls, kde, save_path)