Exemple #1
0
def kruskal(nodes, edges):
    """
    Compute a minimum spanning tree (or forest) with Kruskal's algorithm.

    Edges are visited in ascending order of weight. An edge is kept only when
    ``DisjointSet.unite`` reports that it merged two previously separate
    sets, i.e. when it does not close a circuit. Edges that mention a node
    which is not in ``nodes`` are ignored.

    :param nodes: sized collection of the graph's nodes.
    :param edges: iterable of ``(src, dst, weight)`` tuples.
    :return: list of the ``(src, dst, weight)`` edges of the minimum spanning
        tree, in the order in which they were accepted.
    """
    # edges of the minimum spanning tree
    mst = []

    # initialize a forest for all nodes
    forest = DisjointSet(nodes)

    # membership is tested once per edge, so use a set when the nodes are
    # hashable instead of rescanning a list for every edge
    try:
        known_nodes = set(nodes)
    except TypeError:
        known_nodes = nodes

    # a spanning tree over n nodes has exactly n - 1 edges; once that many
    # edges are accepted, no further edge can join two different sets
    max_edges = len(known_nodes) - 1

    # perform kruskal's algorithm on the edges sorted by weight
    for (src, dst, weight) in sorted(edges, key=lambda x: x[2]):
        # terminate early once the tree is complete
        if len(mst) >= max_edges:
            break
        # skip edges whose source or destination node doesn't exist
        if src not in known_nodes or dst not in known_nodes:
            continue
        # unite() is truthy only if src and dst were in different sets
        if forest.unite(src, dst):
            mst.append((src, dst, weight))

    # return the minimum spanning tree
    return mst
def parallel_prim(sc, nodes, edges, num_partition=4):
    """
    Compute a minimum spanning tree (or forest) in parallel rounds.

    The graph is stored as a distributed adjacency list. In every round each
    node reports its lightest edge that leaves the node's current disjoint
    set; the driver keeps only the lightest such edge per disjoint set and
    merges the sets along those edges. Rounds repeat until no node has an
    edge leaving its set.

    Restricting each round to one edge per set matters: an edge that is only
    the lightest for a single node (but not for the node's whole set) is not
    guaranteed to belong to a minimum spanning tree.

    :param sc: Spark context used for ``parallelize`` and ``broadcast``.
    :param nodes: nodes for input; they are compared with ``<`` to put each
        edge into a canonical orientation.
    :param edges: iterable of ``(src, dst, weight)`` tuples.
    :param num_partition: number of partitions.
    :return: list of ``(src, dst, weight)`` edges (with ``src < dst``) of the
        minimum spanning tree.
    """
    # edges of the minimum spanning tree
    mst = []

    # initialize a forest for all nodes
    forest = DisjointSet(nodes)

    # emit every undirected edge once per endpoint: (node, (neighbour, weight))
    def generate_graph(iterator):
        for edge in iterator:
            for i in range(2):
                yield (edge[i], (edge[1 - i], edge[2]))

    # store the graph in an adjacency list, neighbours sorted by weight
    adjacent = sc.parallelize(edges, num_partition) \
                 .mapPartitions(generate_graph, preservesPartitioning=True) \
                 .groupByKey(numPartitions=num_partition) \
                 .mapValues(lambda x: sorted(x, key=lambda y: y[1])) \
                 .persist()

    try:
        # loop until there is no candidate
        while True:
            # broadcast the current forest to each machine
            connection = sc.broadcast(forest)

            # yield, for each node, its lightest edge leaving the node's set
            def find_minimum(iterator):
                for group in iterator:
                    src = group[0]
                    for (dst, weight) in group[1]:
                        if connection.value.find(src) != \
                                connection.value.find(dst):
                            # canonical orientation so distinct() can
                            # de-duplicate an edge reported by both endpoints
                            if src < dst:
                                yield (src, dst, weight)
                            else:
                                yield (dst, src, weight)
                            break

            candidates = adjacent.mapPartitions(find_minimum) \
                                 .distinct() \
                                 .collect()
            # collect() has finished, so this round's broadcast is no longer
            # needed on the executors
            connection.unpersist()

            if not candidates:
                break

            # keep only the lightest candidate per disjoint set; a candidate
            # leaves the sets of both of its endpoints
            lightest = {}
            for candidate in candidates:
                for endpoint in candidate[:2]:
                    root = forest.find(endpoint)
                    best = lightest.get(root)
                    if best is None or candidate[2] < best[2]:
                        lightest[root] = candidate

            # merge along the selected edges, lightest first; unite() rejects
            # an edge whose endpoints were already joined in this round
            for candidate in sorted(set(lightest.values()),
                                    key=lambda x: x[2]):
                if forest.unite(candidate[0], candidate[1]):
                    mst.append(candidate)
    finally:
        # release the cached adjacency list
        adjacent.unpersist()

    # return the global MST
    return mst
Exemple #3
0
def eliminate_insiders(components):
    '''Drop every component whose bounding box lies inside another one.

    The boxes returned by ``by_bbox_size`` are compared pairwise; whenever
    ``is_inside`` holds for a pair, the two indices are united in a disjoint
    set. The indices reported by ``final_labels`` are the boxes that are
    kept. ``components.chars`` is overwritten in place.

    Args:
        components: Components instance
    '''
    ordered = by_bbox_size(components)
    count = len(ordered)

    groups = DisjointSet(n_labels=count)

    # every unordered pair exactly once, lower index first
    index_pairs = ((first, second)
                   for first in range(count)
                   for second in range(first + 1, count))
    for first, second in index_pairs:
        if is_inside(ordered[first], ordered[second]):
            groups.unite(first, second)

    components.chars = [ordered[keep] for keep in groups.final_labels()]
def find_blobs(raw_img, args):
    '''Perform two dimensional connected component analysis on an image.

    The image is passed through ``processing.threshold`` and labelled with a
    two-pass scan: the first pass assigns provisional labels and records
    which labels touch each other, the second pass replaces every label by
    its equivalent. Blobs with fewer pixels than a fifth of the median blob
    size are discarded and the remaining blobs are renumbered 1..k.

    Args:
        raw_img (ndarray): original image to be analyzed; its first two axes
            are taken as height and width
        args (Arguments instance): forwarded to ``processing.threshold``
            (presumably defines the threshold value used to binarize the
            image)

    Returns:
        an instance of the Components class built from the bounding boxes of
        the blobs, the original image and the stencil of final labels. The
        stencil is one column wider than the image because of the padding
        column added before labelling.
    '''

    # dimensions
    height = raw_img.shape[0]
    width = raw_img.shape[1]

    img = processing.threshold(raw_img, args)

    # adding a column of zeros so that, in the flattened image, the right
    # most pixel of one row and the left most pixel of the next row are
    # never mistaken for neighbors
    zeros = np.zeros((height, 1))
    img = np.concatenate((img, zeros), axis=1)
    width += 1

    size = height * width
    img = img.reshape(size)
    stencil = np.zeros(size, dtype=int)
    labels = DisjointSet(n_labels=1)

    # first pass
    for i in range(size):

        if img[i] != 0:

            # if a neighboring pixel is labeled the investigated pixel is
            # given the same label. Only the left, upper, upper-left and
            # upper-right neighbors are inspected: when iterating from top
            # left to bottom right, the other neighbors cannot be labeled yet
            for j in [i - 1, i - width, i - width - 1, i - width + 1]:

                if j < 0 or j >= size:
                    continue

                if stencil[j] != 0 and stencil[i] == 0:  # connection
                    stencil[i] = stencil[j]

                elif stencil[j] != 0 and stencil[j] != stencil[i]:  # conflict
                    labels.unite(stencil[i], stencil[j])

                else:  # no connection nor conflict
                    continue

            # if no neighboring pixel is labeled the investigated pixel is
            # given a new label
            if stencil[i] == 0:
                new_label = labels.next()
                stencil[i] = new_label
                labels.add(new_label)

    # second pass to eliminate equivalences
    eq = labels.get_equivalents()
    for label in eq.keys():
        stencil[stencil == label] = eq[label]

    # reshaping stencil
    stencil = stencil.reshape((height, width))

    # count the pixels of every label in one sweep; index 0 is the background
    max_label = int(stencil.max()) if stencil.size else 0
    if max_label > 0:
        pixel_counts = np.bincount(stencil.ravel(),
                                   minlength=max_label + 1)[1:]
        # labels merged away in the second pass have a count of zero and
        # must not influence the median
        min_allowed_pixels = np.median(
            pixel_counts[pixel_counts > 0]) / 5  # arbitrary; seems to work well
        final_labels = np.flatnonzero(pixel_counts >= min_allowed_pixels) + 1
    else:
        # no blob at all: nothing to filter (avoids a median of nothing)
        final_labels = np.array([], dtype=int)

    # filter final labels and renumber the survivors 1..k via a lookup table
    relabel = np.zeros(max_label + 1, dtype=stencil.dtype)
    relabel[final_labels] = np.arange(1, final_labels.size + 1)
    stencil = relabel[stencil]

    # construct boxes around letters
    bounding_boxes = get_bboxes(stencil)
    return Components(boxes=bounding_boxes, img=raw_img, stencil=stencil)