def read_artist_alias(fn):
    """Build a canonical-id lookup from a file of tab-separated id pairs.

    Every line of ``fn`` that parses as exactly two tab-separated integers
    contributes one ``union`` of those two ids. Lines with a different
    number of columns, or with a field that is not an integer, are skipped.

    :param fn: path of the text file to read
    :returns: dict mapping each id that is not the smallest member of its
        set to that smallest member; the smallest ids themselves never
        appear as keys
    :rtype: dict
    """
    merged = UnionFind()
    with open(fn, "r") as lines:
        for raw in lines:
            fields = raw.split("\t")
            try:
                left, right = map(int, fields)
            except ValueError:
                # Wrong column count or a non-integer field: ignore the line.
                continue
            merged.union(left, right)

    canonical = {}
    for group in merged.sets():
        ordered = sorted(group)
        smallest = ordered[0]
        # Point every other member of the set at its smallest member.
        canonical.update((alias, smallest) for alias in ordered[1:])
    return canonical
def __init__(self, signer=None, sketch_dist_fn=None, max_dist=0,
             min_support=1, sketch_operator=operator.__and__,
             sketch_bits=0):
    """Store the configuration and create empty working state.

    Every argument is saved unchanged on the attribute of the same name;
    nothing is validated or derived here.

    :param signer: saved as ``self.signer``
    :param sketch_dist_fn: saved as ``self.sketch_dist_fn``
    :param max_dist: saved as ``self.max_dist``
    :param min_support: saved as ``self.min_support``
    :param sketch_operator: saved as ``self.sketch_operator``; defaults to
        ``operator.__and__``
    :param sketch_bits: saved as ``self.sketch_bits``
    """
    # NOTE(review): how these settings are consumed is decided by methods
    # that are not visible from this constructor -- confirm there.
    self.signer = signer
    self.min_support = min_support
    self.max_dist = max_dist
    self.sketch_dist_fn = sketch_dist_fn
    self.sketch_operator = sketch_operator
    self.sketch_bits = sketch_bits

    # Fresh, empty working state: a union-find structure and a mapping
    # whose missing keys default to an empty dict.
    self.buckets = defaultdict(dict)
    self.union_find = UnionFind()
def __init__(self, signer=None, min_support=2):
    """Store the configuration and create empty working state.

    :param signer: saved unchanged as ``self.signer``
    :param min_support: saved unchanged as ``self.min_support``
    """
    # NOTE(review): how ``signer`` and ``min_support`` are used is decided
    # by methods that are not visible from this constructor.
    self.min_support = min_support
    self.signer = signer

    # Fresh, empty working state: a mapping whose missing keys default to
    # an empty dict, and a union-find structure.
    self.buckets = defaultdict(dict)
    self.union_find = UnionFind()
class Cluster(object):
    """Clusters sets with Jaccard similarity above threshold with high
    probability.

    Algorithm based on Rajaraman, "Mining of Massive Datasets":

    1. Generate set signature
    2. Use LSH to map similar signatures to same buckets
    3. Use UnionFind to merge buckets containing same values
    """

    def __init__(self, signer=None, min_support=2):
        """Create an empty clustering.

        :param signer: optional object whose ``get_signature(item)`` returns
            the LSH keys of an item; when ``None``, ``add_item`` iterates
            the item itself as its keys
        :param min_support: minimum support a pair of labels needs before
            the two labels are merged (see ``_closeness_measure``)
        """
        self.union_find = UnionFind()
        self.signer = signer
        # LSH key -> {label: sketch} for every label filed under that key.
        self.buckets = defaultdict(dict)
        self.min_support = min_support

    def _closeness_measure(self, sketch=None):
        """Return the predicate deciding whether two labels get merged.

        :param sketch: accepted for interface compatibility; not used
        :returns: callable ``(support, sketch) -> bool`` that is true when
            ``support >= min_support``; its ``sketch`` argument is ignored
        """
        # Snapshot the threshold so that the returned predicate does not
        # follow later changes of ``self.min_support``.
        min_support = self.min_support
        return lambda support, sketch: support >= min_support

    def add_item(self, item, label=None, sketch=None):
        """File an item under its LSH keys and merge it with close labels.

        The support of another label is the number of this item's buckets
        that contain that label. Labels whose support satisfies the
        closeness predicate are united with ``label``.

        :param item: the item to add; used as the iterable of LSH keys
            when no signer is configured
        :param label: identifier of the item; defaults to ``item`` itself
        :param sketch: value stored next to the label in every bucket
        """
        # Set default label for this set
        if label is None:
            label = item

        # Add to union-find structure. The lookup is done only for its side
        # effect (presumably UnionFind registers unseen labels on access --
        # confirm against the UnionFind implementation).
        union_find = self.union_find
        union_find.__getitem__(label)

        # Get signature vector and hash it
        if self.signer is None:
            keys = item
        else:
            keys = self.signer.get_signature(item)

        # Unite labels with same LSH keys
        counter = Counter()
        sketches = {}
        buckets = self.buckets
        for key in keys:
            bucket = buckets[key]
            bucket[label] = sketch
            # Pass the keys, not the dict: Counter.update() given a mapping
            # would add the stored sketches as counts.
            counter.update(bucket.keys())
            sketches.update(bucket)

        is_close = self._closeness_measure()
        # items() rather than the Python-2-only iteritems(); the counter is
        # not modified while it is being iterated.
        for matched_label, support in counter.items():
            if matched_label == label:
                continue
            if is_close(support, sketches[matched_label]):
                union_find.union(matched_label, label)

    def add_key(self, key, label=None, sketch=None):
        """Add one LSH key only (with associated info)

        Only a single bucket is involved, so the support shared with any
        other label in that bucket is always exactly one. Labels are
        therefore merged only when ``min_support`` is at most one.

        :param key: the LSH key to file the label under
        :param label: identifier to file; defaults to ``key`` itself
        :param sketch: value stored next to the label in the bucket
        """
        # Set default label for this set
        if label is None:
            label = key

        # Add to union-find structure (lookup done for its side effect).
        union_find = self.union_find
        union_find.__getitem__(label)

        # Unite labels with same LSH keys
        bucket = self.buckets[key]
        bucket[label] = sketch

        is_close = self._closeness_measure()
        for matched_label, matched_sketch in bucket.items():
            if matched_label == label:
                continue
            # The predicate takes (support, sketch). Passing the sketch in
            # the support slot compared it (usually None) with min_support.
            # Note from the original author: precision improved considerably
            # when matches were also required to have distance > 0; this
            # class has no distance measure, so no such check is made.
            if is_close(1, matched_sketch):
                union_find.union(matched_label, label)

    def get_clusters(self):
        """Returns a list of sets representing clusters

        :rtype: list
        """
        return self.union_find.sets()
def test_simple_cluster(self):
    """Chained unions of 0-1, 2-3 and 3-0 must leave a single set."""
    forest = UnionFind()
    # The third pair links the two sets produced by the first two.
    for left, right in ((0, 1), (2, 3), (3, 0)):
        forest.union(left, right)
    expected = [[0, 1, 2, 3]]
    self.assertEqual(forest.sets(), expected)