def _create_groups(self, name: str, model: BaseMatcher, link_min_similarity: float, group_all_strings: bool):
    """ Cluster the strings of one match table and store the result on `self`

    The strings to cluster are the unique, non-null values of the `From`
    column when `group_all_strings` is True, and of the `To` column
    otherwise. They are matched against themselves with `model` and the
    resulting matches are handed to `single_linkage`.

    Arguments:
        name: key of the match table in `self.matches`
        model: matcher whose `match` method is called with the string list
               as both arguments
        link_min_similarity: threshold forwarded to `single_linkage`
        group_all_strings: cluster the `From` strings instead of the `To` strings

    Updates:
        self.matches[name]: gets a `Group` column
        self.clusters[name]: first value returned by `single_linkage`
        self.cluster_mappings[name]: second value returned by `single_linkage`
    """
    frame = self.matches[name]

    # Pick the column whose unique values are clustered
    source_column = "From" if group_all_strings else "To"
    candidates = list(frame[source_column].dropna().unique())

    # Match the candidates against themselves and link them into clusters
    pairwise = model.match(candidates, candidates)
    clusters, id_by_string, name_by_string = single_linkage(pairwise, link_min_similarity)

    # `Group` is always derived from the `To` column, whichever column was
    # clustered; values without an entry in the name mapping keep their
    # original `To` value
    to_values = frame["To"]
    frame["Group"] = to_values.map(name_by_string).fillna(to_values)
    self.matches[name] = frame

    # Keep the clusters and their ids for later lookup
    self.clusters[name] = clusters
    self.cluster_mappings[name] = id_by_string
def group(self, model: Union[str, BaseMatcher] = None, link_min_similarity: float = 0.75):
    """ From the matches, group the `To` matches together using single linkage

    Arguments:
        model: the grouper to use. Either a matcher instance, one of the
               shortcut strings below, or None to use TF-IDF:
                   * "TF-IDF" / "TFIDF"
                   * "EditDistance" / "Edit Distance"
                   * "Embeddings" / "Embedding"
        link_min_similarity: the minimum similarity between strings before they are grouped
                             in a single linkage fashion

    Updates:
        self.matches: Adds a column `Group` that is the grouped version of the `To` column
        self.clusters: reset, then filled per match table with the clusters
        self.cluster_mappings: reset, then filled per match table with the cluster ids

    Raises:
        ValueError: if `model` is a string that is not one of the shortcuts above
    """
    check_matches(self)
    self.clusters = {}
    self.cluster_mappings = {}

    # Standard models - quick access.
    # Every branch must inspect `model` (the requested grouper). Checking
    # `self.method` here would make the choice depend on how the object was
    # matched instead of on what the caller asked for.
    if isinstance(model, str):
        if model in ["TF-IDF", "TFIDF"]:
            model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)
        elif model in ["EditDistance", "Edit Distance"]:
            model = RapidFuzz()
        elif model in ["Embeddings", "Embedding"]:
            model = Embeddings(min_similarity=link_min_similarity)
        else:
            raise ValueError(
                "Please instantiate the model with one of the following methods: \n"
                "* 'TF-IDF'\n"
                "* 'EditDistance'\n"
                "* 'Embeddings'\n"
                "* Or None if you want to automatically use TF-IDF")

    # Use TF-IDF if no model is specified
    elif not model:
        model = TFIDF(n_gram_range=(3, 3), min_similarity=link_min_similarity)

    for name, frame in self.matches.items():
        # Cluster the unique, non-null `To` strings by matching them against themselves
        strings = list(frame.To.dropna().unique())
        matches = model.match(strings, strings)
        clusters, cluster_id_map, cluster_name_map = single_linkage(
            matches, link_min_similarity)

        # Write the groups back and keep track of the clusters and their ids
        self._map_groups(name, cluster_name_map)
        self.clusters[name] = clusters
        self.cluster_mappings[name] = cluster_id_map
def test_linkage(min_similarity):
    """ Check the three values returned by `single_linkage` for a given threshold

    `matches` is read from module scope and is not defined in this block.
    Expectations by threshold:
        * exactly 1.0: all three returned dicts are empty
        * at least 0.8: highest cluster id is 1 and two names are mapped
        * below 0.8: highest cluster id is above 1 and three names are mapped
    """
    linkage_result = single_linkage(matches, min_similarity)
    clusters, cluster_mapping, cluster_name_map = linkage_result
    returned = (clusters, cluster_mapping, cluster_name_map)

    # Whatever the threshold, every returned value is a dict
    for value in returned:
        assert isinstance(value, dict)

    # At a threshold of exactly 1 nothing is linked
    if min_similarity == 1.:
        for value in returned:
            assert value == {}
        return

    if min_similarity >= 0.8:
        expected_names = 2
        assert max(cluster_mapping.values()) == 1
    else:
        expected_names = 3
        assert max(cluster_mapping.values()) > 1
    assert len(cluster_name_map) == expected_names