class Cluster(object):
    """Clusters sets with Jaccard similarity above threshold with high
    probability.

    Algorithm based on Rajaraman, "Mining of Massive Datasets":
    1. Generate set signature
    2. Use LSH to map similar signatures to same buckets
    3. Use UnionFind to merge buckets containing same values
    """

    def __init__(self, width=10, threshold=0.5):
        """Create the signer, the LSH hasher and one bucket map per band.

        `width` is handed to both MinHashSignature and LSH; `threshold`
        is handed to LSH only. The number of bucket maps is whatever
        `LSH.get_n_bands()` reports.
        """
        self.width = width
        self.unionfind = UnionFind()
        self.signer = MinHashSignature(width)
        self.hasher = LSH(width, threshold)
        # One {lsh key -> [labels]} map per band.
        self.hashmaps = [defaultdict(list)
                         for _ in range(self.hasher.get_n_bands())]

    def add_set(self, s, label=None):
        """Sign `s`, file its label in every band bucket and union it
        with the first label already stored in each of those buckets.

        `label` identifies the set; when it is omitted (None) the set
        itself is used as its own label, so `s` must then be usable as a
        union-find key.
        """
        # Only substitute the set when no label was given. A plain
        # truthiness test would also discard legitimate falsy labels
        # such as 0 or "".
        if label is None:
            label = s

        # Add to unionfind structure (indexing is done for its side
        # effect; the returned value is not needed here).
        self.unionfind[label]

        # Get signature
        sig = self.signer.sign(s)

        # Union labels with same LSH key in same band
        for band_idx, hshval in enumerate(self.hasher.hash(sig)):
            bucket = self.hashmaps[band_idx][hshval]
            bucket.append(label)
            # bucket[0] is the first label ever filed under this key;
            # for a fresh bucket that is `label` itself.
            self.unionfind.union(label, bucket[0])

    def get_sets(self):
        """Return whatever the union-find reports as its current sets."""
        return self.unionfind.sets()
def merge():
    """Merge categories.

    Every pair of categories in the module-level `cat_arts` mapping
    whose article sets score above `args.threshold` under `jaccard()`
    is unioned. For each resulting group the category with the most
    articles becomes the parent (the first one wins on ties). Unless
    the handicap draw says otherwise, every other category of the
    group is folded into the parent: the overlap of the two article
    sets is passed to `skill_counts.decr`, the parent's article set
    absorbs the child's, and the child is removed from `cat_arts`.

    A group's merge is skipped when `random.random() < args.handicap`.
    `mg_ops` is increased by `len(group) - 1` for every group whether
    or not the merge was skipped.
    """
    global mg_ops  # dirty :(
    from itertools import combinations

    cats = list(cat_arts.keys())
    uf = UnionFind(cats)

    # All unordered pairs, in the same (i < j) order as a nested
    # index loop over `cats`.
    for cat1, cat2 in combinations(cats, 2):
        if jaccard(cat_arts[cat1], cat_arts[cat2]) > args.threshold:
            # NOTE(review): union() is given a single list here; confirm
            # this matches the signature of the UnionFind in use.
            uf.union([cat1, cat2])

    for group in uf.sets():
        mg_ops += len(group) - 1

        # Largest category becomes the parent. max() returns the first
        # maximal element, and - unlike a "size starts at 0, strictly
        # greater wins" scan - still yields a real category when every
        # member is empty, so cat_arts[parent] can never be
        # cat_arts[None].
        parent = max(group, key=lambda c: len(cat_arts[c]))

        # Exactly one random draw per group.
        if random.random() >= args.handicap:
            for cat in group:
                if cat != parent:
                    logging.info("MERGE: %s -> %s", cat, parent)
                    skill_counts.decr(cat_arts[cat] & cat_arts[parent])
                    cat_arts[parent] |= cat_arts[cat]
                    del cat_arts[cat]
        else:
            # Report each category that would have been merged rather
            # than whichever one a previous loop happened to end on.
            for cat in group:
                if cat != parent:
                    logging.info(
                        "HANDICAP: Skipping merge of %s -> %s", cat, parent)
class Cluster(object):
    """Clusters sets with Jaccard similarity above threshold with high
    probability.

    Algorithm based on Rajaraman, "Mining of Massive Datasets":
    1. Generate set signature
    2. Use LSH to map similar signatures to same buckets
    3. Use UnionFind to merge buckets containing same values
    """

    def __init__(self, minHashLen=13, numRowsInBucket=2, threshold=None):
        """Create the signer, the LSH hasher and the per-band bucket maps.

        `minHashLen` is handed to both MinHashSignature and LSH;
        `numRowsInBucket` and `threshold` are handed to LSH only.
        """
        self.unionfind = UnionFind()
        self.signer = MinHashSignature(minHashLen)
        self.hasher = LSH(minHashLen, numRowsInBucket, threshold)
        # One {lsh key -> [labels]} map per band.
        self.hashmaps = [defaultdict(list)
                         for _ in range(self.hasher.get_n_bands())]
        # label -> list of that label's LSH keys, in band order.
        self.lshmap = {}

    def add_set(self, s, label=None):
        """Sign `s`, file its label in every band bucket, union it with
        the first label already in each bucket, and remember its LSH keys.

        `label` identifies the set; when it is omitted (None) the set
        itself is used as its own label, so `s` must then be usable as a
        dictionary key. Adding the same label again resets its entry in
        `lshmap`.
        """
        # Only substitute the set when no label was given. A plain
        # truthiness test would also discard legitimate falsy labels
        # such as 0 or "".
        if label is None:
            label = s

        # Add to unionfind structure (indexing is done for its side
        # effect; the returned value is not needed here).
        self.unionfind[label]

        # Get signature
        sig = self.signer.sign(s)

        # Union labels with same LSH key in same band
        lsh_keys = self.hasher.hash(sig)
        label_keys = []
        self.lshmap[label] = label_keys
        for band_idx, hshval in enumerate(lsh_keys):
            bucket = self.hashmaps[band_idx][hshval]
            bucket.append(label)
            # bucket[0] is the first label ever filed under this key;
            # for a fresh bucket that is `label` itself.
            self.unionfind.union(label, bucket[0])
            label_keys.append(hshval)

    def _large_buckets(self, min_cluster_len):
        """Yield every band bucket holding more than `min_cluster_len` labels."""
        for hashmap in self.hashmaps:
            for key in hashmap:
                bucket = hashmap[key]
                if len(bucket) > min_cluster_len:
                    yield bucket

    def get_clusters(self, min_cluster_len):
        """Yield each band bucket (a list of labels) whose length is
        strictly greater than `min_cluster_len`.

        Buckets are visited band by band; the yielded lists are the live
        internal lists, not copies.
        """
        for bucket in self._large_buckets(min_cluster_len):
            yield bucket

    def get_clusters_with_hashes(self, min_cluster_len):
        """Like `get_clusters`, but yield a new list per bucket in which
        each label is paired with its recorded LSH keys.

        An entry is `(label, keys)` when the label has a non-empty key
        list in `lshmap`, and the bare label otherwise.
        """
        for bucket in self._large_buckets(min_cluster_len):
            annotated = []
            for label in bucket:
                keys = self.lshmap[label]
                if keys:
                    annotated.append((label, keys))
                else:
                    annotated.append(label)
            yield annotated

    def get_cluster_unions(self, min_cluster_len):
        """Yield each set reported by the union-find whose size is
        strictly greater than `min_cluster_len`.
        """
        for group in self.unionfind.sets():
            if len(group) > min_cluster_len:
                yield group

    # The parameter name `object` shadows the builtin but is kept so
    # existing keyword callers keep working.
    def get_min_hash(self, object):
        """Return the signer's signature of `object` as a list."""
        return list(self.signer.sign(object))

    def get_lsh_hash(self, object):
        """Return the LSH keys of `object`'s signature as a list."""
        sig = self.signer.sign(object)
        return list(self.hasher.hash(sig))