def merge():
    """Merge categories whose article sets overlap strongly.

    Every pair of categories in the module-level ``cat_arts`` mapping is
    scored with ``jaccard``; pairs above ``args.threshold`` are unioned.
    For each resulting group the member with the most articles becomes the
    parent.  Unless the random handicap draw skips the group, every other
    member is folded into the parent and removed from ``cat_arts``.
    ``mg_ops`` grows by ``len(group) - 1`` for every group, whether or not
    the merge is actually carried out.
    """
    global mg_ops  # dirty :(
    names = list(cat_arts.keys())
    uf = UnionFind(names)
    total = len(cat_arts)
    for i in range(total):
        for j in range(i + 1, total):
            first, second = names[i], names[j]
            if jaccard(cat_arts[first], cat_arts[second]) > args.threshold:
                uf.union([first, second])

    for group in uf.sets():
        mg_ops += len(group) - 1

        # First member with the strictly largest article count; stays None
        # when every member of the group is empty.
        best_size = 0
        parent = None
        for cat in group:
            n_arts = len(cat_arts[cat])
            if n_arts > best_size:
                best_size = n_arts
                parent = cat

        if random.random() >= args.handicap:
            for cat in group:
                if cat == parent:
                    continue
                logging.info("MERGE: %s -> %s" % (cat, parent))
                skill_counts.decr(cat_arts[cat] & cat_arts[parent])
                cat_arts[parent] |= cat_arts[cat]
                del cat_arts[cat]
        else:
            # ``cat`` is whichever member the size scan above visited last.
            logging.info("HANDICAP: Skipping merge of %s -> %s" % (cat, parent))
class Tree:
    """Acyclic graph stored as nested dicts, guarded by a UnionFind."""

    def __init__(self, directed=False, weighted=False):
        """Create an empty tree; the flags are stored as given."""
        self.directed = directed
        self.weighted = weighted
        self.tree = {}
        self.vertex_num = 0
        self.vertex = []
        self.components = UnionFind()

    def add_edge(self, origin, destiny, weight=0):
        """Add the edge ``origin -> destiny``, registering unseen vertices.

        Raises:
            Exception: if both endpoints are already in the same component,
                since the edge would close a cycle.
        """
        def register(vertex):
            # Ignore vertices that are already known.
            if vertex in self.tree.keys():
                return
            self.components.add(vertex)
            self.tree[vertex] = {}
            self.vertex_num += 1
            self.vertex.append(vertex)

        for endpoint in (origin, destiny):
            if endpoint not in self.tree.keys():
                register(endpoint)

        if self.components.connected(origin, destiny):
            raise Exception("Cannot add edge, would create a cicle")

        self.tree[origin][destiny] = weight
        if not self.directed:
            # Undirected trees store the edge in both directions.
            self.tree[destiny][origin] = weight
        self.components.union(origin, destiny)
def connected_components_diff(game, player):
    """
    Difference between the component score of one player and its opponent.

    Blank cells are unioned with their neighbours, then each player's
    location is unioned with its own neighbours.  The score of a location is
    whatever ``uf.components(location)`` returns, converted to float.

    :param game: board object exposing width, height, blank spaces and
        player locations
    :param player: the player whose score is the minuend
    :return: player score minus opponent score, as a float
    """
    uf = UnionFind(game.width * game.height)

    for cell in game.get_blank_spaces():
        for adjacent in neighbors(game, cell):
            uf.union(cell, adjacent)

    player_location = game.get_player_location(player)
    opp_location = game.get_player_location(game.get_opponent(player))
    for location in (player_location, opp_location):
        for adjacent in neighbors(game, location):
            uf.union(adjacent, location)

    pl_score = float(uf.components(player_location))
    op_score = float(uf.components(opp_location))
    return pl_score - op_score
def __init__(self):
    """Read ``clustering.txt`` and union nodes whose codes differ by <= 2 bits.

    The first line of the file holds integers; the first two are used as the
    node count and the number of bits per code.  Each following non-blank
    line is a bit string, parsed base 2.  Nodes sharing a code, or whose
    codes differ in one or two bit positions, are unioned.  The component
    and element counts are printed at the end.
    """
    self.alpha = {}
    self.input = None
    node_id = 0
    with open('clustering.txt') as f:
        header = f.readline().rstrip().split(' ')
        self.input = [int(token) for token in header]
        for row in f:
            if row.isspace():
                continue
            code = int(row.rstrip().replace(' ', ''), 2)
            # Group node ids by their integer code.
            self.alpha.setdefault(code, set()).add(node_id)
            node_id += 1

    uf = UnionFind(list(range(self.input[0])))

    # XOR masks: zero difference, every two-bit difference, every one-bit one.
    single_bits = [1 << shift for shift in range(self.input[1])]
    masks = [0]
    masks.extend(a ^ b for a, b in it.combinations(single_bits, 2))
    masks.extend(single_bits)

    for key in self.alpha:
        for mask in masks:
            other = self.alpha.get(mask ^ key, None)
            if other is None:
                continue
            merged = self.alpha[key].union(other)
            leader = merged.pop()
            for member in merged:
                uf.union(leader, member)
    print(uf.n_comps, uf.n_elts)
class Cluster(object):
    """Clusters sets with Jaccard similarity above threshold with high
    probability.

    Algorithm based on Rajaraman, "Mining of Massive Datasets":
    1. Generate set signature
    2. Use LSH to map similar signatures to same buckets
    3. Use UnionFind to merge buckets containing same values
    """

    def __init__(self, width=10, threshold=0.5):
        """Set up the signer, the LSH hasher and one bucket map per band."""
        self.width = width
        self.unionfind = UnionFind()
        self.signer = MinHashSignature(width)
        self.hasher = LSH(width, threshold)
        n_bands = self.hasher.get_n_bands()
        self.hashmaps = [defaultdict(list) for _ in range(n_bands)]

    def add_set(self, s, label=None):
        """Register set ``s`` under ``label`` (``s`` itself when falsy)."""
        # A label for this set
        if not label:
            label = s

        # Add to unionfind structure
        self.unionfind[label]

        # Get signature
        signature = self.signer.sign(s)

        # Union labels with same LSH key in same band
        for band_idx, hshval in enumerate(self.hasher.hash(signature)):
            bucket = self.hashmaps[band_idx][hshval]
            bucket.append(label)
            self.unionfind.union(label, bucket[0])

    def get_sets(self):
        """Return the groups currently held by the union-find structure."""
        return self.unionfind.sets()
def simplify(self):
    """Collapse productions with equal right-hand sides until none remain.

    Works on a deep copy of ``self.productions``.  Each pass maps every
    duplicate key to the key kept for it, drops the duplicates, and rewrites
    the surviving rules with ``substitute``.  Passes repeat until one finds
    no duplicates.

    Returns:
        A tuple of the simplified ``Grammar`` and a dict mapping each
        surviving key to ``uf.component(key)``.
    """
    rules = copy.deepcopy(self.productions)
    uf = UnionFind(list(rules.keys()))
    while True:
        subs = {}
        to_add = {}
        to_delete = set()
        for kept, kept_rule in rules.items():
            for dup, dup_rule in rules.items():
                if (kept != dup and kept not in subs and dup not in subs
                        and kept_rule == dup_rule):
                    subs[dup] = kept
                    to_add[kept] = dup_rule
                    to_delete.add(dup)
                    uf.union(kept, dup)
        rules = {**rules, **to_add}
        for key in to_delete:
            del rules[key]
        # An empty substitution map means this pass changed nothing.
        if not subs:
            break
        rules = {key: rule.substitute(subs) for key, rule in rules.items()}
    return Grammar(productions=rules), {k: uf.component(k) for k in rules.keys()}
def equations_possible_using_union_find(equations: List[str]) -> bool:
    """Return True when the equality/inequality equations are consistent.

    Each equation unpacks into four items: left variable, operator
    character, a second character that is ignored, and right variable.
    Equalities (operator ``'='``) are unioned first; the result is False as
    soon as an inequality (operator ``'!'``) joins two connected variables.
    """
    uf = UnionFind()
    for lhs, op, _, rhs in equations:
        if op == '=':
            uf.union(lhs, rhs)

    for lhs, op, _, rhs in equations:
        if op == '!' and uf.is_connect(lhs, rhs):
            return False
    return True
def __init__(self, width=10, threshold=0.5):
    """Set up the signer, the LSH hasher and one bucket map per LSH band.

    Args:
        width: passed to both ``MinHashSignature`` and ``LSH``.
        threshold: passed to ``LSH``.
    """
    self.width = width
    self.unionfind = UnionFind()
    self.signer = MinHashSignature(width)
    self.hasher = LSH(width, threshold)
    n_bands = self.hasher.get_n_bands()
    self.hashmaps = [defaultdict(list) for _ in range(n_bands)]
def combine_boxes(boxes, img_dim, dist_threshold=15, padding=0):
    """Uses UnionFind to group close-by contours into boxes (disjoint
    connected components).

    Parameters:
        boxes (list(numpy array)): List of numpy arrays of coordinates of a
            box as returned by enclosing_box.
        img_dim (tuple(int)): Tuple of image dimensions.
        dist_threshold (int): Threshold of number of pixels to determine
            whether boxes are too close.
        padding (int): Number of pixels that are padding.
    Return:
        (numpy array): Numpy array of combined boxes.
    """
    count = len(boxes)
    uf = UnionFind(count)
    for first, second in combinations(range(count), 2):
        if closest_distance(boxes[first], boxes[second]) < dist_threshold:
            uf.union(first, second)

    # Pool the coordinates of every box in a group.
    box_groups = []
    for group in uf.groups():
        pooled = []
        for idx in group:
            pooled.extend(boxes[idx])
        box_groups.append(pooled)

    combined_boxes = [
        enclosing_box(points, img_dim, padding=padding)
        for points in box_groups
    ]
    return np.array([box for box in combined_boxes if fits_criteria_box(box)])
def kruskal(edges):
    """Calculate the cost and edges of a spanning tree, given the edges of a
    graph.

    ``edges`` yields ``(node, node, weight)`` triples.  Returns a tuple
    ``(tree_edges, cost)``.

    Complexity: O(|E| log |E|)
    """
    # Collect the nodes and push every edge on a weight-ordered heap.
    all_nodes = set()
    heap = []
    for src, dst, weight in edges:
        all_nodes.update((src, dst))
        heappush(heap, (weight, src, dst))

    # Every node starts in its own set.
    forest = UnionFind(all_nodes)

    tree_edges = []
    cost = 0

    # Take edges cheapest-first, skipping those inside one component.
    while heap:
        weight, src, dst = heappop(heap)
        if not forest.is_same_set(src, dst):
            cost += weight
            forest.union(src, dst)
            tree_edges.append((src, dst))

    return (tree_edges, cost)
def flip_inside_O_by_UF(board: List[List[str]]) -> None:
    """Overwrite with 'X' every cell not connected to the board border.

    'O' cells on the border are unioned with one sentinel index
    (``rows * cols``), and adjacent 'O' cells are unioned with each other.
    Any cell whose root differs from the sentinel's root is then set to 'X'.
    The board is modified in place; nothing is returned.

    The sentinel index is kept in a local and used for both the unions and
    the final comparison, so the function does not depend on the UnionFind
    object exposing a ``dummyIdx`` attribute.
    """
    if not board:
        return

    uf = UnionFind(board)
    rows, cols = uf.r, uf.c
    sentinel = rows * cols

    for i in range(rows):
        for j in range(cols):
            if board[i][j] != 'O':
                continue
            cell = i * cols + j
            # connect all boundary nodes to the ONE dummy node
            if i in (0, rows - 1) or j in (0, cols - 1):
                uf.union(cell, sentinel)
            # connect to the right neighbour when it is also 'O'
            if j + 1 < cols and board[i][j + 1] == 'O':
                uf.union(cell, cell + 1)
            # connect to the neighbour below when it is also 'O'
            if i + 1 < rows and board[i + 1][j] == 'O':
                uf.union(cell, cell + cols)

    for i in range(rows):
        for j in range(cols):
            if uf.find_root(i * cols + j) != uf.find_root(sentinel):
                board[i][j] = 'X'
def __init__(self, directed=False, weighted=False):
    """Create an empty tree.

    Args:
        directed: stored on the instance as ``self.directed``.
        weighted: stored on the instance as ``self.weighted``.
    """
    self.directed, self.weighted = directed, weighted
    # Adjacency mapping plus vertex bookkeeping, all empty at the start.
    self.tree = {}
    self.vertex = []
    self.vertex_num = 0
    self.components = UnionFind()
def define_shells_structure(me):
    """Build a UnionFind over the vertices of ``me``, joined along its edges.

    Args:
        me: object exposing ``vertices`` (sized) and ``edges``; each edge
            has a two-item ``vertices`` attribute.

    Returns:
        The UnionFind after ``unify`` has been called for every edge.
    """
    union_find = UnionFind(len(me.vertices))
    # Plain loop: a comprehension here would only build a list of discarded
    # return values.
    for edge in me.edges:
        v1, v2 = edge.vertices
        union_find.unify(v1, v2)
    return union_find
def find_circle_num_using_union_find(M: List[List[int]]) -> int:
    """Count groups in adjacency matrix ``M``.

    Only the lower-left triangle of ``M`` is read.  The result is the number
    of indices in ``uf.parent`` that are their own parent.
    """
    uf = UnionFind(len(M))
    for row in range(uf.num):
        # ignore upper-right triangle in matrix
        for col in range(row):
            if M[row][col] == 1:
                uf.union(row, col)
    return sum(1 for index, parent in enumerate(uf.parent) if index == parent)
def UnionFindCommunity(self, G):
    """Attach the union-find components of ``G`` as the "Union find" attribute.

    Every edge of ``G`` is unioned; the resulting components are passed, as
    a list, to ``self.addGNodesAttr``.
    """
    uf = UnionFind(G.nodes())
    for source, target in G.edges():
        uf.union(source, target)
    score = [nodes for nodes in uf.components()]
    self.addGNodesAttr(G, score, "Union find")
def trip_roster_merged(trip_roster_file, colname_file, trip_chain,
                       park_pair_file, gas_pair_file):
    """Split a trip roster into unmatched trips and an OD table of matched ones.

    Args:
        trip_roster_file: header-less CSV of trips.
        colname_file: CSV whose header row supplies the roster column names.
        trip_chain: when equal to ``False`` the parking and gas pairs are
            used as they are; otherwise pairs are chained with UnionFind and
            every chain is reduced to one pair.
        park_pair_file: CSV of parking-matched trip pairs.
        gas_pair_file: CSV of gas-matched trip pairs.

    Returns:
        ``(trip_unmatched, trip_od)``: the roster rows whose ``TripId`` is in
        no matched pair, and the matched pairs joined with the start
        coordinates of the ``EndTripId`` trip and the end coordinates of the
        ``StartTripId`` trip, plus a ``TripId`` column made by joining the
        pair ids with ``'_'``.
    """
    col_names = pd.read_csv(colname_file)
    trip_roster = pd.read_csv(trip_roster_file, header=None,
                              names=col_names.columns)

    # Kept as an equality test so values such as None still take the
    # chaining branch, as before.
    if trip_chain == False:  # noqa: E712
        matched_trip_pair = pd.read_csv(park_pair_file)
        matched_trip_pair.columns = [
            'TripId', 'StopId', 'EndTripId', 'StartTripId'
        ]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        matched_trip_pair = pd.concat([
            matched_trip_pair[['EndTripId', 'StartTripId', 'StopId']],
            pd.read_csv(gas_pair_file),
        ])
        trip_pair_id = ['EndTripId', 'StopId', 'StartTripId']
        new_pair_df = matched_trip_pair
    else:
        matched_trip_pair = pd.read_csv(park_pair_file,
                                        usecols=['end', 'start'])
        matched_trip_pair.columns = ['EndTripId', 'StartTripId']
        matched_trip_pair = pd.concat([
            matched_trip_pair,
            pd.read_csv(gas_pair_file, usecols=['EndTripId', 'StartTripId']),
        ])
        trip_pair_id = ['EndTripId', 'StartTripId']

        # trip chaining
        start_time = time.time()
        uf = UnionFind(list(set(matched_trip_pair.values.flatten())))
        for index, row in matched_trip_pair.iterrows():
            uf.union(row['EndTripId'], row['StartTripId'])
        result = uf.components()
        print('Trip chaining takes %s secs for %s trip pairs.' %
              (time.time() - start_time, len(matched_trip_pair)))

        def set_first_last(input_set):
            # NOTE(review): if components are plain sets, "first" and "last"
            # follow set iteration order, not trip order - confirm intent.
            tmp = list(input_set)
            return [tmp[0], tmp[-1]]

        new_pair = [set_first_last(component) for component in result]
        new_pair_df = pd.DataFrame(new_pair,
                                   columns=['EndTripId', 'StartTripId'])

    matched_ids = matched_trip_pair[['EndTripId', 'StartTripId']].values.flatten()
    trip_unmatched = trip_roster.loc[~trip_roster['TripId'].isin(matched_ids)]

    # create od file for matched trips/ trip chain
    trip_od = new_pair_df.merge(
        trip_roster[['TripId', 'StartLocLat', 'StartLocLon']]
        .rename(columns={'TripId': 'EndTripId'}),
        how='left', sort=False)
    trip_od = trip_od.merge(
        trip_roster[['TripId', 'EndLocLat', 'EndLocLon']]
        .rename(columns={'TripId': 'StartTripId'}),
        how='left', sort=False)
    trip_od['TripId'] = trip_od[trip_pair_id].apply(
        lambda row: '_'.join(row.tolist()), axis=1)
    return trip_unmatched, trip_od
def clustering(edge_list, count_nodes, clusters):
    """Merge nodes along ``edge_list`` until ``clusters`` components remain.

    Args:
        edge_list: indexable entries where ``entry[0]`` is the edge cost and
            ``entry[1]`` holds the two endpoints.  Edges are processed in
            the order given (presumably ascending cost - confirm with the
            caller).
        count_nodes: number of nodes; nodes are labelled ``1..count_nodes``.
        clusters: target number of clusters.

    Returns:
        ``(cost, u)``: the cost of the first edge that joins two different
        components once only ``clusters`` components remain, and the
        UnionFind.

    Raises:
        IndexError: if the edges run out before such an edge is found.
    """
    u = UnionFind([x + 1 for x in range(count_nodes)])
    for entry in edge_list:
        cost, endpoints = entry[0], entry[1]
        a, b = endpoints[0], endpoints[1]
        if u.find(a) == u.find(b):
            continue
        if count_nodes <= clusters:
            return cost, u
        u.union(a, b)
        count_nodes -= 1
    # Same exception type the unbounded index walk used to raise.
    raise IndexError(
        "edge list exhausted before reaching %s clusters" % clusters)
def rm_stones_using_union_find(stones: List[List[int]]) -> int:
    """Return ``len(uf.p)`` minus the number of distinct stone groups.

    Stones sharing a row or a column are unioned.  Each stone ``(x, y)`` is
    keyed as ``10000 * x + y``.
    """
    if not stones:
        return 0

    def key(x, y):
        # Flatten a coordinate pair into one integer id.
        return 10000 * x + y

    uf = UnionFind()
    for x, y in stones:
        for i, j in stones:
            if x == i or y == j:
                uf.union(key(x, y), key(i, j))

    roots = {uf.find_root(key(x, y)) for x, y in stones}
    return len(uf.p) - len(roots)
def kruskal(g, w):
    """Return spanning-tree edges chosen in order of increasing weight.

    Args:
        g: mapping whose keys are the graph's nodes.
        w: mapping from edge ``(a, b)`` to its weight.

    Returns:
        The accepted edges, in the order they were accepted.  Selection
        stops once ``len(g) - 1`` edges have been taken.
    """
    uf = UnionFind(g.keys())
    by_weight = sorted(w, key=lambda edge: w[edge])
    chosen = list()
    for edge in by_weight:
        a, b = edge
        if uf.connected(a, b):
            continue
        uf.union(a, b)
        chosen.append(edge)
        if len(chosen) == len(g) - 1:
            break
    return chosen
def _transitive_closure(self):
    """Group points joined by must-link pairs into chunklets.

    Reads ``self.n``, ``self.ml`` (pairs of indices), ``self.X`` and
    ``self.d``.  Sets ``self.uf``, ``self.chunklets``, ``self.n_chunklets``,
    ``self.chunklet_shapes`` (column vector of chunklet sizes) and
    ``self.chunklet_means`` (one mean row per chunklet).

    Chunklets usually differ in length.  NumPy >= 1.24 refuses to build an
    array from such ragged input unless the dtype is object, so the ragged
    case fills a 1-D object array explicitly.  Equal-length chunklets still
    produce the regular 2-D array they always did.
    """
    self.uf = UnionFind(np.arange(self.n))
    for link in self.ml:
        self.uf.union(link[0], link[1])

    members = [np.array(list(comp)) for comp in self.uf.components()]
    if len({m.shape[0] for m in members}) <= 1:
        self.chunklets = np.array(members)
    else:
        self.chunklets = np.empty(len(members), dtype=object)
        for idx, member in enumerate(members):
            self.chunklets[idx] = member

    self.n_chunklets = self.chunklets.shape[0]
    self.chunklet_shapes = np.array([c.shape[0] for c in self.chunklets])
    self.chunklet_shapes = self.chunklet_shapes.reshape(-1, 1)
    self.chunklet_means = np.array(
        [np.mean(self.X[c], axis=0) for c in self.chunklets])
    assert self.chunklet_means.shape == (self.n_chunklets, self.d)
def findCircleNum(self, M):
    """
    Count groups in adjacency matrix ``M`` using its lower triangle only.

    :type M: List[List[int]]
    :rtype: int
    """
    size = len(M)
    uf = UnionFind(size)
    for row in range(size):
        for col in range(row):
            if M[row][col] != 1:
                continue
            uf.add(row, col)
    return uf.componentsNum()
def mspEdges(self, heuristicFn):
    """Return spanning-tree edges, taken in ascending heuristic weight.

    Weights come from ``self.generate_weights(heuristicFn)``; edges are
    visited in ``argsort`` order of those weights and kept when their
    endpoints are not yet connected.
    """
    uf = UnionFind(len(self.nodes))
    weights = self.generate_weights(heuristicFn)
    accepted = []
    for edge_index in argsort(weights):
        (u, v) = self.edges[edge_index]
        if uf.connected(u, v):
            continue
        accepted.append((u, v))
        uf.union(u, v)
    return accepted
def mspEdges(self, heuristicFn):
    """Pick edges Kruskal-style, ordered by the weights of ``heuristicFn``.

    Returns the list of ``(u, v)`` edges whose endpoints were in different
    components at the time the edge was visited.
    """
    node_count = len(self.nodes)
    uf = UnionFind(node_count)
    ranking = argsort(self.generate_weights(heuristicFn))
    picked = []
    for position in ranking:
        u, v = self.edges[position]
        if not uf.connected(u, v):
            picked.append((u, v))
            uf.union(u, v)
    return picked
def longest_consecutive_by_uf(nums: List[int]) -> int:
    """Return ``uf.largest_one_union()`` after joining consecutive values.

    Every value is mapped to an index in ``nums`` (the last one for
    duplicates).  The index of each value ``v`` is unioned with the index of
    ``v + 1`` when that value is present.
    """
    if not nums:
        return 0

    uf = UnionFind(nums)
    index_of = {value: index for index, value in enumerate(nums)}
    for value in index_of.keys():
        # Test membership rather than truthiness: index 0 is a valid hit and
        # must not be mistaken for "missing".
        if value + 1 in index_of:
            uf.union(index_of[value], index_of[value + 1])
    return uf.largest_one_union()
def max_space_clustering(edges, vertices, clusters):
    """Cluster ``vertices`` into ``clusters`` groups, merging closest first.

    Args:
        edges: objects exposing ``u``, ``v`` and ``dist``.
        vertices: sized collection of vertices.
        clusters: target number of clusters.

    Returns:
        ``(root_parents, max_spacing)``: a dict mapping each vertex to its
        root, and the ``dist`` of the first edge crossing two clusters after
        ``len(vertices) - clusters`` merges.  ``max_spacing`` is ``None``
        when no such edge exists (for example when ``clusters`` is 1).
        Before this fix that case raised ``UnboundLocalError``.
    """
    union_find = UnionFind()
    for vertex in vertices:
        union_find.add_vertex(vertex)

    merges_needed = len(vertices) - clusters
    merges_done = 0
    max_spacing = None
    for edge in sorted(edges, key=(lambda x: x.dist)):
        rootu = union_find.path_compress_find(edge.u)
        rootv = union_find.path_compress_find(edge.v)
        if rootu == rootv:
            # Edge lies inside one cluster: never merges, never spacing.
            continue
        if merges_done == merges_needed:
            max_spacing = edge.dist
            break
        union_find.union(edge.u, edge.v)
        merges_done += 1

    root_parents = dict()
    for vertex in list(vertices):
        root_parents[vertex] = union_find.path_compress_find(vertex)
    return root_parents, max_spacing
def __init__(self, lemat_dict_file):
    """Load the lemmatizer and union all lemmas that share an entry.

    Every lemma from ``all_lemats()`` gets its own set.  Then, for each item
    of the lemmatizer, the lemmas listed for it are chained together.
    """
    self.lematizer = Lematizer(lemat_dict_file)
    self.superbase = UnionFind()

    for lemma in self.lematizer.all_lemats():
        self.superbase.make_set(lemma)

    for _, lems in self.lematizer.items():
        previous = None
        for lemma in lems:
            # Link each lemma to the representative found for the one before.
            if previous:
                self.superbase.union(previous, lemma)
            previous = self.superbase.find(lemma)
def kruskalMST(self):
    """Run Kruskal's algorithm and return the MST cost, or None if no MST.

    Edges are popped from ``self.PQ`` (filled by ``generatePQ``); accepted
    edges are stored in ``self.mst`` and their cost is added to
    ``self.mstCost``.

    Returns:
        ``self.mstCost`` when ``uf.size1(0)`` equals ``self.N`` after the
        run, otherwise ``None``.  Before this fix the existence check was
        computed and then ignored, so a cost was returned even when no
        spanning tree existed.
    """
    self.generatePQ()
    uf = UnionFind()
    uf.WeightedQuickUnionUF(self.N)
    self.mst = [None] * self.N
    index = 0

    while len(self.PQ) != 0:
        edge = heapq.heappop(self.PQ)
        # Skip edges whose endpoints are already connected.
        if uf.connected(edge.u, edge.v):
            continue
        uf.unify(edge.v, edge.u)
        self.mstCost += edge.cost
        self.mst[index] = edge
        index += 1
        # Stop as soon as one component holds every node.
        if uf.size1(0) == self.N:
            break

    mst_exists = (uf.size1(0) == self.N)
    return self.mstCost if mst_exists else None
def kruskal_mst(self):
    """Return ``(mst, w)``: the MST edge set and its total weight.

    ``self.es`` is treated as holding both orientations of every undirected
    edge; only one orientation of each is considered.  Edges are visited in
    ascending ``self.weights`` order and accepted when they join two
    different sets.

    The loop stops once ``len(self.vs) - 1`` edges are accepted, and the
    result is always returned after the loop.  Two defects are fixed here:
    the old stop test was "every vertex has been touched", which can hold
    before the tree is connected, and the function returned ``None`` when
    the last edge completed the tree.
    """
    assert self.is_connected(), "Can only find MST of a connected graph"
    uf = UnionFind()
    for v in self.vs:
        uf.make_set(v)

    # Keep a single orientation of each undirected edge.
    half = set()
    for u, v in sorted(self.es):
        if (v, u) not in half:
            half.add((u, v))

    mst = set()
    w = 0
    edges_needed = len(self.vs) - 1
    for u, v in sorted(half, key=lambda e: self.weights[e]):
        if len(mst) >= edges_needed:
            break
        if uf.find_set(u) != uf.find_set(v):
            uf.union(u, v)
            mst.add((u, v))
            w += self.weights[(u, v)]
    return mst, w
def accounts_merge_using_union_find(
        accounts: List[List[str]]) -> List[List[str]]:
    """Merge accounts that share at least one email.

    Each account is ``[name, email, email, ...]``.  All emails of an account
    are unioned with its first email.  One row is produced per group of
    ``uf.groups()``: the name recorded for the group's key email, followed
    by the group's emails sorted.
    """
    uf = UnionFind()
    email_to_name = dict()
    for account in accounts:
        owner = account[0]
        emails = account[1:]
        for email in emails:
            email_to_name[email] = owner
            uf.union(email, emails[0])

    merged = []
    for pmail, emails in uf.groups().items():
        merged.append([email_to_name[pmail]] + sorted(emails))
    return merged
def gen_model(dataset_name):
    """Build a UnionFind linking tweets to their URLs and replied-to tweets.

    Args:
        dataset_name: passed to ``load_data``.

    Returns:
        ``(uf, event_data)``: the UnionFind over all ``(tweet, url)`` and
        ``(tweet, replied tweet)`` pairs, and the loaded tweets.

    Replies whose target is not in ``event_data`` are now counted in
    ``missing_replies_amount``.  They used to be added to the unrelated
    ``missing_urls_amount``, so the logged "missing" figure was always 0.
    """
    event_data, missing_urls_amount = load_data(dataset_name)

    ##########

    _info(
        "create pairs (t, u) or (t, t') for each tweet t and url u or replied/retweeted tweet t'"
    )

    replies_amount = 0
    retweets_amount = 0
    quotes_amount = 0
    missing_replies_amount = 0

    pairs = []
    for tweet_id, tweet in event_data.items():
        pairs.extend(
            (tweet_id, url) for url in tweet.expanded_urls.values() if url)

        # retweets ARE considered, due to be exact text copies of the
        # retweeted tweet
        if tweet.retweet_id != 'NULL':
            retweets_amount += 1

        if tweet.quote_id != 'NULL':
            quotes_amount += 1

        if tweet.reply_id != 'NULL':
            replies_amount += 1
            if tweet.reply_id in event_data:
                pairs.append((tweet_id, tweet.reply_id))
            else:
                missing_replies_amount += 1

    _info(
        f'total pairs: {len(pairs)}, retweets: {retweets_amount}, quotes: {quotes_amount}, replies: {replies_amount} '
        f'(missing: {missing_replies_amount})')

    ##########

    # All keys must be of the same type (strings here).  UnionFind
    # vectorizes its operations and casts everything in the array to one
    # type, so a mix of integers and strings would all become strings and
    # comparisons would fail when calling uf.components().
    _info('applying union-find')

    uf = UnionFind()
    for u, v in pairs:
        uf.union(u, v)

    _info(f'total components: {len(uf.components())}')

    return uf, event_data
def hammond_distances(file_path):
    """Count the clusters formed by the bit codes listed in ``file_path``.

    The first line holds the number of codes and the number of bits per
    code.  Each following line is a code; spaces and the newline are
    stripped before it is added.  ``update_singles`` and ``update_doubles``
    are called for every code (presumably to union codes at Hamming
    distance 1 and 2 - confirm against those helpers).

    The file is opened with ``with`` so the handle is closed even when a
    line fails to parse.

    Returns:
        The number of distinct roots over ``uf._node_titles``.
    """
    uf = UnionFind([])
    with open(file_path) as file_stream:
        line_one = file_stream.readline().split(' ')
        count_edges, count_bits = int(line_one[0]), int(line_one[1])
        for _ in range(count_edges):
            code = file_stream.readline()
            code = code.replace(' ', '').replace('\n', '')
            uf.add(code)
            update_singles(uf, code, count_bits)
            update_doubles(uf, code, count_bits)

    clusters = {uf.find(title) for title in uf._node_titles.keys()}
    return len(clusters)
def __init__(self, minHashLen=13, numRowsInBucket=2, threshold=None):
    """Set up the signer, the LSH hasher and the per-band bucket maps.

    Args:
        minHashLen: passed to ``MinHashSignature`` and ``LSH``.
        numRowsInBucket: passed to ``LSH``.
        threshold: passed to ``LSH``.
    """
    self.unionfind = UnionFind()
    self.signer = MinHashSignature(minHashLen)
    self.hasher = LSH(minHashLen, numRowsInBucket, threshold)
    band_count = self.hasher.get_n_bands()
    # One bucket map per LSH band.
    self.hashmaps = [defaultdict(list) for _ in range(band_count)]
    self.lshmap = {}
def test_add_node(self):
    """Freshly added nodes lead themselves and have no followers."""
    u = UnionFind()
    foo = Node("foo")
    bar = Node("bar")
    baz = Node("baz")
    for node in (foo, bar, baz):
        u.add(node)

    self.assertEqual(3, len(u.leader))
    for node in (foo, bar, baz):
        self.assertEqual(node, u.leader[node])

    self.assertEqual(3, len(u.followers))
    for node in (foo, bar, baz):
        self.assertEqual(set(), u.followers[node])
def __init__(self, cities_data, data_limit: int = 0, print: bool = False):
    """Load cities from a CSV file and build the initial loops.

    Args:
        cities_data: path or buffer handed to ``pd.read_csv``; rows must
            provide ``CityId``, ``X`` and ``Y``.
        data_limit: number of leading rows to keep; ``0`` keeps them all.
        print: stored as ``self._print``.
    """
    self.cities = {}
    self._loops = []
    self._borders = []
    self.loops = UnionFind()
    self._print = print

    df = pd.read_csv(cities_data)
    upper = data_limit if data_limit != 0 else len(df)
    df = df[0:upper]
    for _, record in df.iterrows():
        city = City(record.CityId, record.X, record.Y)
        self.cities[city.id] = city
    self._create_loops()
def Clustering(nodes):
    """Union every node with those of its neighbours that are also in ``nodes``.

    Args:
        nodes: mapping whose keys are the nodes to cluster.

    Returns:
        The UnionFind after all unions.
    """
    union_nodes = UnionFind(nodes.keys())
    for node in tqdm(nodes):
        for neighbour in getNeiborhood(node):
            # Neighbours outside the input are ignored.
            if neighbour in nodes and not union_nodes.connected(node, neighbour):
                union_nodes.union(node, neighbour)
    return union_nodes
def test_union(self):
    """After ``union(foo, bar)`` both nodes report ``bar`` as their leader."""
    u = UnionFind()
    foo = Node("foo")
    u.add(foo)
    bar = Node("bar")
    u.add(bar)

    # Before the union each node is its own leader.
    for node in (foo, bar):
        self.assertEqual(node, u.find(node))

    u.union(foo, bar)

    for node in (foo, bar):
        self.assertEqual(bar, u.find(node))
def cluster(vertices, radix):
    """Union every vertex with its Hamming neighbours that are also vertices.

    Args:
        vertices: sized iterable of vertices.
        radix: passed to ``hamming_neighbours`` as ``radix``.

    Returns:
        The UnionFind after all unions.

    Progress is written to stdout every 100 vertices.  ``print`` is called
    in function form so the block parses on both Python 2 and Python 3.
    """
    V = frozenset(vertices)
    u = UnionFind()
    for v in V:
        u.add(v)
    print("Starting with {} clusters".format(u.clusters))

    i = 0.0  # float so the percentage division is a true division on Python 2
    for v in V:
        if i % 100 == 0:
            percent = 100 * (i / len(vertices))
            sys.stdout.write("\r%f%% (%d)" % (percent, len(V)))
            sys.stdout.flush()
        for candidate in hamming_neighbours(v, radix=radix, dist=2):
            if candidate in V:
                u.union(v, candidate)
        i += 1
    sys.stdout.write("\n")
    return u
def mst(self, b_lengths, weights):
    """Kruskal's algorithm for minimum spanning tree

    Input:
    b_lengths: dictionary with keys (node,node) and values branch length
    weights: dictionary maps (node,node) to negative log likelihood

    Output:
    adj_mat: adjacency matrix. dict (node,node) keys weight values
    """
    # Drop the existing parent/child links so the tree can be rebuilt.
    # Children are cleared only for nodes that have more than one child.
    for index in range(len(self.nodes)):
        node = self.nodes[index]
        node.parent = None
        node.parent_weight = None
        if len(node.children) > 1:
            node.children = []

    # unionfind object
    d_set = UnionFind()

    # edges in non-decreasing weight order
    ordered = sorted(weights.items(), key=lambda item: item[1])

    # the new graph, kept as a symmetric adjacency mapping
    adj_mat = {}
    for (u, v), _weight in ordered:
        if d_set[u] == d_set[v]:
            continue
        d_set.union(u, v)
        length = b_lengths[(u, v)]
        adj_mat[(u, v)] = length
        adj_mat[(v, u)] = length

    return adj_mat
def cluster(graph, k):
    """Merge the nodes of ``graph`` until ``k`` clusters remain.

    Args:
        graph: object exposing ``edges`` and a ``nodes`` mapping.
        k: target number of clusters.

    Returns:
        ``(mindist, followers)``: the value of ``get_mindist`` for the
        remaining edges, and ``u.followers``.
    """
    # NOTE(review): if ``heapify`` is ``heapq.heapify`` it returns None and
    # this assignment is wrong; presumably it is a project helper that
    # returns the heap - confirm.
    edges = heapify(graph.edges)
    u = UnionFind()
    # Plain loop: a comprehension here would only build a discarded list.
    for node in graph.nodes.values():
        u.add(node)

    while u.clusters > k:
        cost, edge = heappop(edges)
        # Edges that would close a cycle are skipped.
        if not cycle(u, edge):
            u.union(u.find(edge.v0), u.find(edge.v1))

    mindist = get_mindist(u, edges)
    return mindist, u.followers
def max_k_clustering(gr, k):
    """Merge the nodes of ``gr`` along ascending edge weights into ``k`` groups.

    Args:
        gr: graph exposing ``get_edge_weights()`` (items of the form
            ``(weight, (u, v))``) and ``nodes()``.
        k: target number of groups.

    Returns:
        ``uf.get_sets()`` once at most ``k`` groups remain, or ``None`` if
        the edges run out while more than ``k`` groups remain.  Before this
        fix ``None`` was also returned when the very last union reached
        ``k``, because the group count was checked only at the top of the
        next iteration.
    """
    sorted_edges = sorted(gr.get_edge_weights())
    uf = UnionFind()

    # initialize each node as its cluster
    for n in gr.nodes():
        uf.insert(n)

    for (w, (u, v)) in sorted_edges:
        if uf.count_groups() <= k:
            return uf.get_sets()

        if uf.get_leader(u) != uf.get_leader(v):
            uf.make_union(uf.get_leader(u), uf.get_leader(v))

    # The last union may have reached the target.
    if uf.count_groups() <= k:
        return uf.get_sets()
    return None
def __init__(self, width=10, threshold=0.5):
    """Set up the union-find, signer, LSH hasher and an empty hash map.

    Args:
        width: passed to both ``MinHashSignature`` and ``LSH``.
        threshold: passed to ``LSH``.
    """
    self.width = width
    self.hashmap = {}
    self.unionfind = UnionFind()
    self.signer = MinHashSignature(width)
    self.hasher = LSH(width, threshold)
#         g = open(gtm_fname, 'r')
#         g.close()
#     except IOError:
#         gtm_fname = None
#     if gtm_fname is not None: break
# if gtm_fname is None:
#     sys.stderr.write("Specify a gtm file\n")
#     exit(1)
# sys.stderr.write("using " + gtm_fname + "\n")

gtm = GtmFile(gtm_fname)
group_count = len(gtm.groups)

# find connected components which have to be paths
# wanna check if they are paths?
uf = UnionFind()
for raw in f.readlines():
    raw = raw.strip()
    # Stop at the "# y" marker line.
    if raw.startswith("# y"):
        break
    # Skip blank lines and other comments.
    if raw == "" or raw.startswith("#"):
        continue
    # Fields are comma separated when a comma is present, else space separated.
    if ',' in raw:
        fields = raw.split(',')
    elif ' ' in raw:
        fields = raw.split(' ')
    else:
        raise Exception("Invalid line: " + raw)
    if len(fields) < 2:
        raise Exception("ERROR line: %s" % fields)
    u, v = int(fields[0]), int(fields[1])
    uf.union(u, v)

# make lists of groups (no dummies) in each component
vertices = range(1, group_count + 1)
def setUp(self):
    """Insert the pairs a-b, b-c and i-j into a fresh UnionFind."""
    self.uf = UnionFind()
    for first, second in (("a", "b"), ("b", "c"), ("i", "j")):
        self.uf.insert(first, second)
class test_unionfind(unittest.TestCase):
    """Tests for insert / get_leader / make_union / count_groups."""

    def setUp(self):
        """Insert the pairs a-b, b-c and i-j into a fresh UnionFind."""
        self.uf = UnionFind()
        for first, second in (("a", "b"), ("b", "c"), ("i", "j")):
            self.uf.insert(first, second)

    def test_get_parent_method(self):
        """Members of a group share its leader; the two groups differ."""
        for member in ("a", "b", "c"):
            self.assertEqual("a", self.uf.get_leader(member))
        for member in ("j", "i"):
            self.assertEqual("i", self.uf.get_leader(member))
        self.assertNotEqual(self.uf.get_leader("a"), self.uf.get_leader("i"))

    def test_insert_method(self):
        """Inserting c-d puts d into the group led by a."""
        self.uf.insert("c", "d")
        leader_of_d = self.uf.get_leader("d")
        self.assertEqual(self.uf.get_leader("c"), leader_of_d)
        self.assertEqual(self.uf.get_leader("a"), self.uf.get_leader("d"))

    def test_insert_one_node(self):
        """A node inserted alone leads itself and forms a third group."""
        self.uf.insert('z')
        self.assertEqual(self.uf.get_leader('z'), 'z')
        self.assertEqual(self.uf.count_groups(), 3)

    def test_make_union_method(self):
        """Uniting two leaders gives both groups the same leader."""
        leader_a = self.uf.get_leader("a")
        leader_i = self.uf.get_leader("i")
        self.uf.make_union(leader_a, leader_i)
        self.assertEqual(self.uf.get_leader("a"), self.uf.get_leader("i"))

    def test_make_union_with_invalid_leader_raises_exception(self):
        """make_union raises when given "z", which was never inserted."""
        self.assertRaises(Exception, self.uf.make_union, "a", "z")

    def test_get_count(self):
        """Inserting a new pair raises the group count to three."""
        self.uf.insert("z", "y")
        self.assertEqual(self.uf.count_groups(), 3)
def makeUnionFind(_set, N):
    """Return a ``UnionFind(N)`` with every pair in ``_set`` unioned."""
    uf = UnionFind(N)
    for pair in _set:
        first, second = pair
        uf.union(first, second)
    return uf
class Cluster(object):
    """Clusters sets with Jaccard similarity above threshold with high
    probability.

    Algorithm based on Rajaraman, "Mining of Massive Datasets":
    1. Generate set signature
    2. Use LSH to map similar signatures to same buckets
    3. Use UnionFind to merge buckets containing same values
    """

    def __init__(self, minHashLen=13, numRowsInBucket=2, threshold=None):
        """Set up the signer, the LSH hasher and the per-band bucket maps."""
        self.unionfind = UnionFind()
        self.signer = MinHashSignature(minHashLen)
        self.hasher = LSH(minHashLen, numRowsInBucket, threshold)
        band_count = self.hasher.get_n_bands()
        self.hashmaps = [defaultdict(list) for _ in range(band_count)]
        self.lshmap = {}

    def add_set(self, s, label=None):
        """Register set ``s`` under ``label`` (``s`` itself when falsy)."""
        # A label for this set
        if not label:
            label = s

        # Add to unionfind structure
        self.unionfind[label]

        # Get signature
        signature = self.signer.sign(s)

        # Union labels with same LSH key in same band
        band_keys = self.hasher.hash(signature)
        self.lshmap[label] = []
        for band_idx, hshval in enumerate(band_keys):
            bucket = self.hashmaps[band_idx][hshval]
            bucket.append(label)
            self.unionfind.union(label, bucket[0])
            self.lshmap[label].append(hshval)

    def get_clusters(self, min_cluster_len):
        """Yield every bucket holding more than ``min_cluster_len`` labels."""
        for hashmap in self.hashmaps:
            for key in hashmap:
                members = hashmap[key]
                if len(members) > min_cluster_len:
                    yield members

    def get_clusters_with_hashes(self, min_cluster_len):
        """Like ``get_clusters`` but pairs each label with its LSH keys.

        A label whose recorded key list is empty is yielded bare.
        """
        for hashmap in self.hashmaps:
            for key in hashmap:
                members = hashmap[key]
                if len(members) <= min_cluster_len:
                    continue
                annotated = []
                for label in members:
                    if self.lshmap[label]:
                        annotated.append((label, self.lshmap[label]))
                    else:
                        annotated.append(label)
                yield annotated

    def get_cluster_unions(self, min_cluster_len):
        """Yield union-find groups with more than ``min_cluster_len`` members."""
        for group in self.unionfind.sets():
            if len(group) > min_cluster_len:
                yield group

    def get_min_hash(self, object):
        """Return the signature of ``object`` as a list."""
        signature = self.signer.sign(object)
        return list(signature)

    def get_lsh_hash(self, object):
        """Return the LSH band keys of ``object`` as a list."""
        signature = self.signer.sign(object)
        return list(self.hasher.hash(signature))
def recolor_by_connected_components(self):
    """Renumber group and individual colors by connected component.

    A group ``g`` and an individual-at-time ``(i, t)`` are unioned when the
    group's color equals the individual's color at that time.  The same
    individual at consecutive times is unioned when its color is unchanged.
    Each component then gets a fresh color number, assigned 1, 2, ... in
    order of first appearance, and ``self.group_color`` and
    ``self.ind_color`` are overwritten with those numbers.

    Groups, individuals and times are used as ``x-1`` list indexes, so they
    are presumably 1-based - confirm against the gtm loader.
    """
    from unionfind import UnionFind
    uf = UnionFind()
    for t in self.gtm.times:
        # Touch every group and every (individual, time) key with find().
        # NOTE(review): presumably this registers the keys - confirm against
        # the UnionFind implementation.
        for g in self.gtm.time[t]:
            uf.find(g)
        for i in self.gtm.inds:
            uf.find((i,t))
        # Link a group with each member that shares its color at time t.
        for g in self.gtm.time[t]:
            for i in self.gtm.group[g]:
                if self.group_color[g-1]==self.ind_color[i-1][t-1]:
                    uf.union(g, (i,t))
                    leader = uf.find(g)
        # Link an individual across consecutive time steps when its color
        # did not change.
        if t>1:
            for i in self.gtm.inds:
                if self.ind_color[i-1][t-1]==self.ind_color[i-1][t-2]:
                    uf.union((i,t-1), (i,t))
                    leader = uf.find((i,t-1))
    # Number the components in order of first appearance.
    new_color = {}
    for t in self.gtm.times:
        for g in self.gtm.time[t]:
            leader = uf.find(g)
            if leader not in new_color:
                new_color[leader] = len(new_color)+1
        for i in self.gtm.inds:
            leader = uf.find((i,t))
            if leader not in new_color:
                new_color[leader] = len(new_color)+1
    # Write the new colors back.
    for g in self.gtm.groups:
        self.group_color[g-1] = new_color[uf.find(g)]
    for i in self.gtm.inds:
        for t in self.gtm.times:
            self.ind_color[i-1][t-1] = new_color[uf.find((i,t))]
def test_find(self):
    """A node that was only added is returned by ``find`` unchanged."""
    node = Node("foo")
    u = UnionFind()
    u.add(node)
    found = u.find(node)
    self.assertEqual(node, found)
def maze(w, h, size=2):
    """Build a ``w`` x ``h`` Labyrinth by randomly joining a coarse cell grid.

    A coarse grid of ``nw`` x ``nh`` cells (one cell every ``size`` fine
    cells) is joined Kruskal-style: the edges between adjacent coarse cells
    are shuffled and popped until the union-find reports one component.
    Every accepted edge marks a strip of fine cells as True.

    Finally ``(0, 0)`` is set as the start and ``(lab.w - 2, lab.h - 2)`` as
    the goal, both with value 1.

    NOTE(review): all cells start at 0 and carved cells become True;
    presumably 0 means wall and a truthy value means open - confirm against
    Labyrinth.
    """
    def conv_size(n):
        # Number of coarse cells needed to span n fine cells.
        return (n - 1) // size + 1
    nw, nh = conv_size(w), conv_size(h)
    # Extra strip cells carved on each side of the centre line
    # (0 for the default size=2).
    ns = size // 2 - 1
    uf = UnionFind(nw * nh)
    lab = Labyrinth(w, h)
    for x in range(w):
        for y in range(h):
            lab[x, y] = 0
    # Candidate edges between adjacent coarse cells, as flattened indices.
    edges = []
    for i in range(nh - 1):
        for j in range(nw - 1):
            f = flatten(i, j, nw, nh)
            edges.append((f, f + 1))  # right
            edges.append((f, f + nw))  # down
    # Last column only has "down" edges, last row only has "right" edges.
    for i in range(nh - 1):
        f = flatten(i, nw - 1, nw, nh)
        edges.append((f, f + nw))  # down
    for j in range(nw - 1):
        f = flatten(nh - 1, j, nw, nh)
        edges.append((f, f + 1))  # right
    shuffle(edges)
    # NOTE(review): len(uf) is presumably the number of components - confirm.
    while len(uf) > 1:
        u, v = edges.pop()
        y1, x1 = unflatten(u, nw, nh)
        y2, x2 = unflatten(v, nw, nh)
        if uf.find(u) != uf.find(v):
            uf.union(u, v)
            if x2 - x1 == 1:
                # "Right" edge: mark size + 1 cells along x, plus up to ns
                # cells on either side in y, stopping at the board edge.
                for i in range(size + 1):
                    for j in range(1, ns + 1):
                        ny = size * y1 - j
                        if ny >= 0:
                            lab[size * x1 + i, ny] = True
                        else:
                            break
                    lab[size * x1 + i, size * y1] = True
                    for j in range(1, ns + 1):
                        ny = size * y1 + j
                        if ny < h:
                            lab[size * x1 + i, ny] = True
                        else:
                            break
            else:
                # "Down" edge: the same marking with the axes swapped.
                for i in range(size + 1):
                    for j in range(1, ns + 1):
                        nx = size * x1 - j
                        if nx >= 0:
                            lab[nx, size * y1 + i] = True
                        else:
                            break
                    lab[size * x1, size * y1 + i] = True
                    for j in range(1, ns + 1):
                        nx = size * x1 + j
                        if nx < w:
                            lab[nx, size * y1 + i] = True
                        else:
                            break
    lab[0, 0] = 1
    lab.start = 0, 0
    lab[lab.w - 2, lab.h - 2] = 1
    lab.goal = lab.w - 2, lab.h - 2
    return lab
class Edge(object):
    """Weighted edge between two nodes."""

    def __init__(self, node1, node2, cost=0, marked=None):
        self.node1 = node1
        self.node2 = node2
        self.cost = cost
        self.marked = marked

    def __cmp__(self, y):
        # Only consulted by Python 2; the sort below uses an explicit key.
        return self.cost - y.cost

    def __repr__(self):
        return '<Edge(%s, %s), cost:%s>' % (self.node1, self.node2, self.cost)


# Read "<node> <node> <cost>" lines after a two-field header line.
# The file is opened with ``with`` so the handle is always closed.
edges = []
with open('./edges.txt', 'r') as f:
    n_nodes, n2 = f.readline().strip().split()
    for l in f:
        a, b, c = l.split()
        edges.append(Edge(str(a), str(b), cost=int(c)))

edges = sorted(edges, key=lambda x: x.cost)

# Kruskal: keep an edge when its endpoints are in different components.
U = UnionFind()
T = []
for e in edges:
    if U[e.node1] != U[e.node2]:
        T.append(e)
        U.union(e.node1, e.node2)

# Function-call form prints the same on Python 2 and parses on Python 3.
print(sum(e.cost for e in T))