コード例 #1
0
ファイル: polyfuzz.py プロジェクト: maybeee18/PolyFuzz
    def _create_groups(self,
                       name: str,
                       model: BaseMatcher,
                       link_min_similarity: float,
                       group_all_strings: bool):
        """ Cluster the strings of one match table and store the result as a `Group` column

        Arguments:
            name: key of the match table in `self.matches` to work on
            model: matcher whose `match` method compares the selected strings with themselves
            link_min_similarity: threshold forwarded unchanged to `single_linkage`
            group_all_strings: if True the unique `From` values are clustered (use this when both
                               compared lists are equal), otherwise the unique `To` values

        Updates:
            self.matches[name]: gains a `Group` column derived from the `To` column
            self.clusters[name]: the clusters returned by `single_linkage`
            self.cluster_mappings[name]: the cluster id mapping returned by `single_linkage`
        """
        frame = self.matches[name]

        # Select the column that supplies the strings to cluster
        source_column = frame.From if group_all_strings else frame.To
        candidates = list(source_column.dropna().unique())

        # Match the strings against themselves and link the results into clusters
        self_matches = model.match(candidates, candidates)
        clusters, cluster_id_map, cluster_name_map = single_linkage(self_matches, link_min_similarity)

        # `To` values that have no entry in the name map keep their own value as group
        to_column = frame['To']
        frame["Group"] = to_column.map(cluster_name_map).fillna(to_column)
        self.matches[name] = frame

        # Keep the clusters and their ids available under the same key
        self.clusters[name] = clusters
        self.cluster_mappings[name] = cluster_id_map
コード例 #2
0
ファイル: polyfuzz.py プロジェクト: yashugupta786/PolyFuzz
    def group(self,
              model: Union[str, BaseMatcher] = None,
              link_min_similarity: float = 0.75):
        """ From the matches, group the `To` matches together using single linkage

        Arguments:
            model: the grouper to use. Either a matcher instance, one of the names
                   'TF-IDF'/'TFIDF', 'EditDistance'/'Edit Distance', 'Embeddings'/'Embedding',
                   or None to automatically use TF-IDF
            link_min_similarity: the minimum similarity between strings before they are grouped
                                 in a single linkage fashion

        Updates:
            self.matches: Adds a column `Group` that is the grouped version of the `To` column
            self.clusters: reset, then filled with the clusters found for every match table
            self.cluster_mappings: reset, then filled with the cluster id mapping for every
                                   match table

        Raises:
            ValueError: if `model` is a string that is not one of the supported names
        """
        check_matches(self)
        self.clusters = {}
        self.cluster_mappings = {}

        # Standard models - quick access
        if isinstance(model, str):
            # Every branch dispatches on the requested `model` name. Checking `self.method`
            # here would pick the grouper from the matcher used earlier, not from the argument.
            if model in ["TF-IDF", "TFIDF"]:
                model = TFIDF(n_gram_range=(3, 3),
                              min_similarity=link_min_similarity)
            elif model in ["EditDistance", "Edit Distance"]:
                model = RapidFuzz()
            elif model in ["Embeddings", "Embedding"]:
                model = Embeddings(min_similarity=link_min_similarity)
            else:
                raise ValueError(
                    "Please instantiate the model with one of the following methods: \n"
                    "* 'TF-IDF'\n"
                    "* 'EditDistance'\n"
                    "* 'Embeddings'\n"
                    "* Or None if you want to automatically use TF-IDF")

        # Use TF-IDF if no model is specified
        elif not model:
            model = TFIDF(n_gram_range=(3, 3),
                          min_similarity=link_min_similarity)

        for name, frame in self.matches.items():
            # Cluster the unique, non-missing `To` strings by matching them against themselves
            strings = list(frame.To.dropna().unique())
            matches = model.match(strings, strings)
            clusters, cluster_id_map, cluster_name_map = single_linkage(
                matches, link_min_similarity)
            self._map_groups(name, cluster_name_map)
            self.clusters[name] = clusters
            self.cluster_mappings[name] = cluster_id_map
コード例 #3
0
def test_linkage(min_similarity):
    """Check the three mappings returned by `single_linkage` for a given threshold.

    All three results must be dicts. A threshold of exactly 1 must give empty
    results, a threshold of at least 0.8 must give a highest cluster id of 1
    with two name mappings, and any lower threshold must give a highest cluster
    id above 1 with three name mappings.
    """
    linkage_result = single_linkage(matches, min_similarity)
    clusters, cluster_mapping, cluster_name_map = linkage_result

    for produced in (clusters, cluster_mapping, cluster_name_map):
        assert isinstance(produced, dict)

    # Nothing may be linked when a perfect similarity is demanded
    if min_similarity == 1.:
        assert clusters == {}
        assert cluster_mapping == {}
        assert cluster_name_map == {}
        return

    highest_cluster_id = max(cluster_mapping.values())
    if min_similarity >= 0.8:
        assert highest_cluster_id == 1
        assert len(cluster_name_map) == 2
    else:
        assert highest_cluster_id > 1
        assert len(cluster_name_map) == 3