def kruskal(nodes, edges):
    """
    Build a minimum spanning tree (or forest) with Kruskal's algorithm.

    Edges are visited in non-decreasing weight order and an edge is kept
    whenever ``DisjointSet.unite`` reports that it joined two components.

    :param nodes: sized collection of node identifiers; also handed to
        ``DisjointSet`` to initialise one component per node.
    :param edges: iterable of ``(src, dst, weight)`` tuples. Edges that
        reference a node missing from ``nodes`` are ignored.
    :return: list of the selected ``(src, dst, weight)`` edges, in the order
        they were accepted.
    """
    # edges of the minimum spanning tree
    mst = []
    # initialize a forest for all nodes
    forest = DisjointSet(nodes)

    # Build the membership structure once instead of scanning ``nodes`` for
    # every edge; fall back to the original container for unhashable nodes.
    try:
        known_nodes = set(nodes)
    except TypeError:
        known_nodes = nodes

    # A spanning tree over k distinct nodes has exactly k - 1 edges, so that
    # is the count at which no further edge can be accepted.
    num_edges = len(known_nodes) - 1

    for src, dst, weight in sorted(edges, key=lambda edge: edge[2]):
        # skip edges whose endpoints are not part of the graph
        if src not in known_nodes or dst not in known_nodes:
            continue
        # unite() succeeds only when the edge does not close a circuit
        if forest.unite(src, dst):
            mst.append((src, dst, weight))
            # terminate early once the tree is complete
            if len(mst) == num_edges:
                break

    return mst
def parallel_prim(sc, nodes, edges, num_partition=4):
    """
    Compute a minimum spanning tree (forest) in rounds on a Spark cluster.

    Each round broadcasts the current forest, lets every vertex propose its
    lightest incident edge whose endpoints lie in different components, and
    then merges the collected proposals on the driver in weight order. The
    loop ends when a round yields no proposal.

    NOTE(review): selecting one lightest outgoing edge per vertex per round
    looks closer to Boruvka's scheme than to classic Prim -- confirm naming.

    :param sc: Spark context used for ``parallelize`` and ``broadcast``.
    :param nodes: nodes for input; handed to ``DisjointSet``.
    :param edges: edges for input, each indexable as
        ``(src, dst, weight)``.
    :param num_partition: number of partitions.
    :return: list of the selected edges as ``(a, b, weight)`` tuples with
        ``a < b``.
    """
    # edges of the minimum spanning tree
    mst = []
    # initialize a forest for all nodes
    forest = DisjointSet(nodes)

    # emit every edge once per endpoint so it shows up in both adjacency lists
    def generate_graph(iterator):
        for edge in iterator:
            for i in range(2):
                yield (edge[i], (edge[1 - i], edge[2]))

    # store the graph in an adjacency list, neighbours sorted by weight
    adjacent = sc.parallelize(edges, num_partition) \
        .mapPartitions(generate_graph, preservesPartitioning=True) \
        .groupByKey(numPartitions=num_partition) \
        .mapValues(lambda x: sorted(x, key=lambda y: y[1])) \
        .persist()

    try:
        while True:
            # broadcast the forest to each machine
            connection = sc.broadcast(forest)

            # yield, per vertex, the lightest edge leaving its disjoint set
            def find_minimum(iterator):
                for group in iterator:
                    src = group[0]
                    for (dst, weight) in group[1]:
                        if connection.value.find(src) != connection.value.find(dst):
                            # normalise endpoint order so distinct() can
                            # collapse the two directions of one edge
                            yield (src, dst, weight) if src < dst else (dst, src, weight)
                            break

            try:
                # collect() is an action, so the broadcast has been fully
                # consumed once it returns
                candidates = sorted(
                    adjacent.mapPartitions(find_minimum).distinct().collect(),
                    key=lambda x: x[2])
            finally:
                # release this round's broadcast copies on the executors
                connection.unpersist()

            # stop as soon as no component has an outgoing edge left
            if not candidates:
                break

            # merge the proposals into the global MST
            for candidate in candidates:
                # keep the edge only if it does not make a circuit
                if forest.unite(candidate[0], candidate[1]):
                    mst.append(candidate)
    finally:
        # drop the cached adjacency list, even on failure
        adjacent.unpersist()

    # return the global MST
    return mst
def eliminate_insiders(components):
    '''Remove components whose bounding boxes lie inside of others.

    The entries returned by ``by_bbox_size(components)`` are compared
    pairwise with ``is_inside``; every positive pair is united in a
    ``DisjointSet``. ``components.chars`` is then overwritten with the
    entries selected by ``final_labels()``, so the argument is modified
    in place and nothing is returned.

    Args:
        components: Components instance
    '''
    ordered = by_bbox_size(components)
    total = len(ordered)
    groups = DisjointSet(n_labels=total)

    # each unordered pair (first < second) is examined exactly once
    for first in range(total):
        for second in range(first + 1, total):
            if is_inside(ordered[first], ordered[second]):
                groups.unite(first, second)

    components.chars = [ordered[idx] for idx in groups.final_labels()]
def find_blobs(raw_img, args):
    '''Perform two dimensional connected component analysis on an image.

    The image is binarized with ``processing.threshold``, scanned once to
    hand out provisional labels (a pixel inherits the label of its left,
    upper, upper-left or upper-right neighbour; differing neighbour labels
    are recorded as equivalent), relabelled through those equivalences, and
    finally stripped of blobs holding fewer pixels than a fifth of the
    median blob size.

    Args:
        raw_img (ndarray): original image to be analyzed, indexed as
            (rows, columns).
        args (Arguments instance): forwarded to ``processing.threshold``;
            defines the threshold value used to binarize the image.

    Returns:
        an instance of the Components class built from the bounding boxes
        of the surviving blobs, the original image and the label stencil.
        The stencil keeps the zero padding column, so it has one column
        more than ``raw_img``; surviving blobs are numbered 1..K.
    '''
    # dimensions
    height = raw_img.shape[0]
    width = raw_img.shape[1]
    img = processing.threshold(raw_img, args)

    # adding a column of zeros to prevent the left and right most blobs
    # from being mistaken as one once the image is flattened
    zeros = np.zeros((height, 1))
    img = np.concatenate((img, zeros), axis=1)
    width += 1
    size = height * width
    img = img.reshape(size)
    stencil = np.zeros(size, dtype=int)
    labels = DisjointSet(n_labels=1)

    # first pass
    for i in range(size):
        if img[i] != 0:
            # Only neighbours that were already visited (left and the three
            # above) can carry a label. All of them are smaller than i, so
            # only the lower bound needs checking.
            for j in [i - 1, i - width, i - width - 1, i - width + 1]:
                if j < 0:
                    continue
                if stencil[j] != 0 and stencil[i] == 0:
                    # connection: adopt the neighbour's label
                    stencil[i] = stencil[j]
                elif stencil[j] != 0 and stencil[j] != stencil[i]:
                    # conflict: two labels meet, record them as equivalent
                    labels.unite(stencil[i], stencil[j])
            # if no neighbouring pixel is labeled the pixel gets a new label
            if stencil[i] == 0:
                new_label = labels.next()
                stencil[i] = new_label
                labels.add(new_label)

    # second pass to eliminate equivalences
    eq = labels.get_equivalents()
    for label in eq.keys():
        stencil[stencil == label] = eq[label]

    # reshaping stencil (padding column included)
    stencil = stencil.reshape((height, width))

    # count pixels in blobs, calculate median to filter blobs
    final_labels = np.arange(1, np.max(stencil) + 1)
    pixel_counts = np.array(
        [np.count_nonzero(stencil == label) for label in final_labels],
        dtype=int)

    # Labels merged away in the second pass have a count of zero and are
    # excluded from the median.
    populated = pixel_counts[pixel_counts > 0]
    if populated.size:
        # arbitrary; seems to work well
        min_allowed_pixels = np.median(populated) / 5
        keep = pixel_counts >= min_allowed_pixels
    else:
        # no foreground at all: skip the median of an empty array, which
        # would only produce NaN and a RuntimeWarning
        keep = np.zeros(pixel_counts.shape, dtype=bool)

    # filter final labels and renumber the survivors consecutively from 1
    final_labels = final_labels[keep]
    new_stencil = np.zeros_like(stencil)
    for i, label in enumerate(final_labels):
        new_stencil[stencil == label] = i + 1
    stencil = new_stencil

    # construct boxes around letters
    bounding_boxes = get_bboxes(stencil)

    return Components(boxes=bounding_boxes, img=raw_img, stencil=stencil)