Ejemplo n.º 1
0
def character_regions(img, line_regs, bg_thresh=None, **kwargs):
    """
    Find the characters in an image given the regions of lines of text in
    the image.

    Args:
        img (numpy.ndarray): Grayscaled image (must be 2-dimensional).
        line_regs (list[tuple[int, int]]): List of regions representing where
            the lines are. Each region is a (start, end) pair of row indices;
            the end row is included.
        bg_thresh (Optional[int]): Background threshold up to which a pixel
            is considered text and not part of the background. If not provided,
            a default background threshold is calculated separately for each
            line region (via default_background_threshold) and used instead.
        **kwargs: Keyword arguments passed to text_regions.

    Returns:
        list: One entry per line region, in the same order as line_regs. Each
            entry is the result of text_regions for that line, or an empty
            list when the line region contains no text pixels.
    """
    assert len(img.shape) == 2

    regions = []
    w = img.shape[1]

    for start, end in line_regs:
        sub_img = img[start:end + 1, :]

        # Keep the threshold in a per-line local. Assigning the computed
        # default back to bg_thresh would make the first line's threshold
        # stick for every following line.
        if bg_thresh is None:
            line_thresh = default_background_threshold(sub_img)
        else:
            line_thresh = bg_thresh

        # Sanity check: slicing rows must leave the width untouched.
        assert w == sub_img.shape[1]

        # Transpose the (x, y) pixel pairs into per-axis sequences. With no
        # pixels at all the transposition is empty, which would otherwise
        # break the two-way unpacking.
        columns = list(zip(*colored_pixels(sub_img, line_thresh)))
        if not columns:
            regions.append([])
            continue

        # Only the x distribution matters when splitting a line into chars.
        regions.append(text_regions(columns[0], w, **kwargs))

    return regions
Ejemplo n.º 2
0
def get_text_from_regions(img, line_regs, char_regs, classifier, bg_thresh=None,
                          resize=None, **kwargs):
    """
    Try to extract text from an image given the line and character
    regions, and classifier.

    Args:
        img (numpy.ndarray): Image to extract text from. The image should be
            unrotated and be a dark text on a light background.
            NOTE(review): an earlier version of this docstring said the image
            is grayscaled inside this function; no conversion happens in this
            body, so confirm what transform/classifier expect.
        line_regs (list[tuple[int, int]]): Line regions in image, as
            (start, end) row pairs with the end row included.
        char_regs (list[list[tuple[int, int]]]): Character regions in each
            line in the image, as (start, end) column pairs with the end
            column included. Entry i belongs to line_regs[i].
        classifier (Classifier): Classifier to use to predict characters.
        bg_thresh (Optional[int]): Background threshold handed to transform.
            If not provided, a default is calculated for every character
            region via default_background_threshold.
        resize (Optional[tuple[int, int]]): Dimensions to resize each
            character image to. Defaults to None (indicating we should not
            resize). If provided, it should be a tuple containing the width
            and height.
            Ex: (width, height)
        **kwargs: Keyword arguments to pass to classifier predict method.

    Returns:
        str: The string extracted from the image. Every line is terminated
            by a newline character.
    """
    avg_char_dist = avg_dist_between_chars(char_regs)
    LOGGER.debug("avg char dist: {}".format(avg_char_dist))

    # Collect the output in a list and join once at the end instead of
    # growing a string inside the nested loop.
    pieces = []
    for i, char_region in enumerate(char_regs):
        starty, endy = line_regs[i]
        last_end = None
        LOGGER.debug("Checking line {}".format(i))
        for startx, endx in char_region:
            sub_img = img[starty:endy + 1, startx:endx + 1]

            LOGGER.debug("Transforming region between points {} {}"
                         .format((startx, starty), (endx, endy)))

            # A caller-supplied threshold always wins; otherwise derive one
            # from this character's own pixels.
            if bg_thresh is None:
                char_thresh = default_background_threshold(sub_img)
            else:
                char_thresh = bg_thresh
            LOGGER.debug("background threshold: {}".format(char_thresh))

            if resize is not None:
                sub_img = transform(sub_img, resize[0], resize[1], char_thresh)
                if sub_img is None:
                    # transform produced nothing usable; skip this region
                    # without touching last_end.
                    continue
            prediction = classifier.predict([sub_img], **kwargs)[0]
            LOGGER.debug("prediction: {}".format(prediction))

            # Add space if characters are far enough apart
            if last_end is not None and startx - last_end > avg_char_dist:
                pieces.append(" ")

            pieces.append(prediction)
            last_end = endx
        pieces.append("\n")
    return "".join(pieces)
Ejemplo n.º 3
0
def line_regions(img, bg_thresh=None, **kwargs):
    """
    Find the regions of a grayscale image that contain lines of text.

    Args:
        img (numpy.ndarray): Grayscaled image with dark text on a light
            background. Must be 2-dimensional.
        bg_thresh (Optional[int]): Background threshold up to which a pixel on
            the image is considered text and not part of the background color.
            If this is not provided (bg_thresh is None), the threshold is
            obtained from default_background_threshold(img). Must not be
            negative.
        **kwargs: Keyword arguments passed to text_regions.

    Returns:
        list[tuple[int, int]]: List of regions along the y-axis where lines
            of text in the image start and end. Empty when the image has no
            text pixels for the given threshold.
    """
    if bg_thresh is None:
        bg_thresh = default_background_threshold(img)

    assert bg_thresh >= 0, "bg_thresh cannot be negative."
    assert len(img.shape) == 2

    # Transpose the (x, y) pixel pairs into per-axis sequences. An image
    # without any text pixels yields an empty transposition, which cannot be
    # unpacked into two names, so report "no lines" explicitly.
    columns = list(zip(*colored_pixels(img, bg_thresh)))
    if not columns:
        return []

    # Lines are separated along the vertical axis, so only y is needed.
    return text_regions(columns[1], img.shape[0], **kwargs)
Ejemplo n.º 4
0
def character_regions(img, line_regs, bg_thresh=None, **kwargs):
    """
    Find the characters in an image given the regions of lines of text in
    the image.

    Args:
        img (numpy.ndarray): Grayscaled image (must be 2-dimensional).
        line_regs (list[tuple[int, int]]): List of regions representing where
            the lines are. Each region is a (start, end) pair of row indices;
            the end row is included.
        bg_thresh (Optional[int]): Background threshold up to which a pixel
            is considered text and not part of the background. If not provided,
            a default background threshold is calculated separately for each
            line region (via default_background_threshold) and used instead.
        **kwargs: Keyword arguments passed to text_regions.

    Returns:
        list: One entry per line region, in the same order as line_regs. Each
            entry is the result of text_regions for that line, or an empty
            list when the line region contains no text pixels.
    """
    assert len(img.shape) == 2

    regions = []
    w = img.shape[1]

    for start, end in line_regs:
        sub_img = img[start:end + 1, :]

        # The threshold lives in a per-line local: writing a computed default
        # back into bg_thresh would freeze the first line's value for all
        # remaining lines.
        if bg_thresh is None:
            line_thresh = default_background_threshold(sub_img)
        else:
            line_thresh = bg_thresh

        # Sanity check: slicing rows must leave the width untouched.
        assert w == sub_img.shape[1]

        # Split the (x, y) pixel pairs into per-axis sequences; the result is
        # empty when the line holds no text pixels, which the original
        # two-name unpacking could not handle.
        columns = list(zip(*colored_pixels(sub_img, line_thresh)))
        if not columns:
            regions.append([])
            continue

        # Characters are separated along the horizontal axis only.
        regions.append(text_regions(columns[0], w, **kwargs))

    return regions
Ejemplo n.º 5
0
def line_regions(img, bg_thresh=None, **kwargs):
    """
    Find the regions of a grayscale image that contain lines of text.

    Args:
        img (numpy.ndarray): Grayscaled image with dark text on a light
            background. Must be 2-dimensional.
        bg_thresh (Optional[int]): Background threshold up to which a pixel on
            the image is considered text and not part of the background color.
            If this is not provided (bg_thresh is None), the threshold is
            obtained from default_background_threshold(img). Must not be
            negative.
        **kwargs: Keyword arguments passed to text_regions.

    Returns:
        list[tuple[int, int]]: List of regions along the y-axis where lines
            of text in the image start and end. Empty when the image has no
            text pixels for the given threshold.
    """
    if bg_thresh is None:
        bg_thresh = default_background_threshold(img)

    assert bg_thresh >= 0, "bg_thresh cannot be negative."
    assert len(img.shape) == 2

    # Split the (x, y) pixel pairs into per-axis sequences. With no text
    # pixels the split is empty and cannot be unpacked into two names, so
    # answer with "no lines" instead of failing.
    columns = list(zip(*colored_pixels(img, bg_thresh)))
    if not columns:
        return []

    # Only the vertical distribution is needed to separate lines.
    return text_regions(columns[1], img.shape[0], **kwargs)
Ejemplo n.º 6
0
def load_english_hand(base_dir,
                      samples=55,
                      width=20,
                      retain=0.8,
                      classes=62,
                      thresh=None,
                      colors=256):
    """
    Load english handwritten characters  (using pc tablet) from
    http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/

    Classes (55 samples of each; total 3410 samples)
    1-10:
        Numbers 0-9
    11-36:
        Uppercase
    37-62:
        Lowercase

    The image paths are read, one per line and relative to base_dir, from
    the index file "all.txt~" inside base_dir.
    NOTE(review): the trailing "~" looks like an editor backup name; confirm
    this is the intended index file.

    Args:
        base_dir (str): Directory containing sample data.
        samples (Optional[int]): Number of samples for each class.
        width (Optional[int]): Width and height of each sample.
            Defaults to 20.
        retain (Optional[float]): Share of sample data to retain as
            training data, as passed on to split_data. The rest is used as
            test data. Defaults to 0.8.
        classes (Optional[int]): Number of classes. Defaults to 62.
        thresh (Optional[int]): Background threshold. If not provided, one is
            calculated per image via default_background_threshold.
        colors (Optional[int]): Number of colors; pixel values are divided by
            this number. Defaults to 256.

    Returns:
        The result of split_data(X, y, retain, classes), documented by the
        original author as:
        numpy.ndarray: Training data.
        numpy.ndarray: Training labels.
        numpy.ndarray: Test data.
        numpy.ndarray: Test labels.

    Raises:
        IOError: If the index file cannot be opened or a listed image cannot
            be read.
    """
    import time

    start = time.time()
    X = [None] * samples * classes
    # One label per sample, grouped by class in ALPHA_NUMERIC order.
    y = [c for c in ALPHA_NUMERIC for _ in range(samples)]

    index_path = os.path.join(base_dir, "all.txt~")
    # The handle gets its own name so it no longer shadows the `samples`
    # parameter.
    with open(index_path, "r") as index_file:
        for i, sample in enumerate(index_file):
            filename = os.path.join(base_dir, sample.strip())
            img = cv2.imread(filename, 0)
            if img is None:
                # cv2.imread signals failure by returning None rather than
                # raising; fail here with the offending path.
                raise IOError(
                    "Could not read sample image: {}".format(filename))

            if thresh is None:
                sample_thresh = default_background_threshold(img)
            else:
                sample_thresh = thresh

            vec = transform(img, width, width, sample_thresh)

            # Scale pixel values by the number of colors.
            X[i] = vec.astype(np.float32) / colors
            LOGGER.debug("Loaded: {}".format(i))
    LOGGER.info("Loading training set: {} seconds".format(time.time() - start))

    return split_data(X, y, retain, classes)
Ejemplo n.º 7
0
def load_english_hand(base_dir, samples=55, width=20, retain=0.8,
                      classes=62, thresh=None, colors=256):
    """
    Load english handwritten characters  (using pc tablet) from
    http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/

    Classes (55 samples of each; total 3410 samples)
    1-10:
        Numbers 0-9
    11-36:
        Uppercase
    37-62:
        Lowercase

    The image paths are read, one per line and relative to base_dir, from
    the index file "all.txt~" inside base_dir.
    NOTE(review): the trailing "~" looks like an editor backup name; confirm
    this is the intended index file.

    Args:
        base_dir (str): Directory containing sample data.
        samples (Optional[int]): Number of samples for each class.
        width (Optional[int]): Width and height of each sample.
            Defaults to 20.
        retain (Optional[float]): Share of sample data to retain as
            training data, as passed on to split_data. The rest is used as
            test data. Defaults to 0.8.
        classes (Optional[int]): Number of classes. Defaults to 62.
        thresh (Optional[int]): Background threshold. If not provided, one is
            calculated per image via default_background_threshold.
        colors (Optional[int]): Number of colors; pixel values are divided by
            this number. Defaults to 256.

    Returns:
        The result of split_data(X, y, retain, classes), documented by the
        original author as:
        numpy.ndarray: Training data.
        numpy.ndarray: Training labels.
        numpy.ndarray: Test data.
        numpy.ndarray: Test labels.

    Raises:
        IOError: If the index file cannot be opened or a listed image cannot
            be read.
    """
    import time

    start = time.time()
    X = [None] * samples * classes
    # One label per sample, grouped by class in ALPHA_NUMERIC order.
    y = [c for c in ALPHA_NUMERIC for _ in range(samples)]

    index_path = os.path.join(base_dir, "all.txt~")
    # Use a dedicated name for the handle instead of rebinding the `samples`
    # parameter.
    with open(index_path, "r") as index_file:
        for i, sample in enumerate(index_file):
            filename = os.path.join(base_dir, sample.strip())
            img = cv2.imread(filename, 0)
            if img is None:
                # cv2.imread returns None for unreadable files; report the
                # path instead of failing later with an unrelated error.
                raise IOError(
                    "Could not read sample image: {}".format(filename))

            if thresh is None:
                sample_thresh = default_background_threshold(img)
            else:
                sample_thresh = thresh

            vec = transform(img, width, width, sample_thresh)

            # Scale pixel values by the number of colors.
            X[i] = vec.astype(np.float32) / colors
            LOGGER.debug("Loaded: {}".format(i))
    LOGGER.info("Loading training set: {} seconds".format(time.time() - start))

    return split_data(X, y, retain, classes)
Ejemplo n.º 8
0
def main():
    """
    Visualize how an image is split into lines and characters.

    Reads the image named on the command line, thresholds it to black and
    white, and shows a series of matplotlib figures: the pixel color
    histogram, the thresholded image, the text-pixel distribution along the
    y-axis, the detected text pixels with line/character boundaries, one
    x-axis histogram per detected line, and the boundaries drawn over the
    image. When a save directory is given the character images are saved
    through save_images; otherwise the function waits for Enter so the
    figures stay open.

    Returns:
        int: Always 0.

    Raises:
        RuntimeError: If a save directory is given without labels.
    """

    def hide_ticks():
        # Hide tick values on the X and Y axis of the current figure.
        plt.xticks([])
        plt.yticks([])

    def draw_line_bounds(lines, width):
        # One horizontal line at the start and one at the end of each region.
        for bounds in lines:
            for y in (bounds[0], bounds[1]):
                plt.plot([0, width], [y, y], color="b")

    def draw_char_bounds(lines, chars_per_line):
        # Vertical lines at both edges of every character, spanning only the
        # text line the character belongs to.
        for idx, spans in enumerate(chars_per_line):
            top, bottom = lines[idx]
            for left, right in spans:
                for x in (left, right):
                    plt.plot([x, x], [top, bottom], color="b")

    args = region_argument_parser(extraction_argument_parser())

    # Load the image (imread flag 0) and rescale it if requested.
    scale = args.resize
    img = cv2.imread(args.filename, 0)
    if scale != 1:
        img = cv2.resize(img, None, fx=scale, fy=scale)
    LOGGER.info("Checking image {}".format(args.filename))
    LOGGER.info("resized image shape: {}".format(img.shape))

    # Histogram and statistics of the pixel colors before thresholding.
    h, w = img.shape[:2]
    pixel_values = img.ravel()
    color_fig = plt.figure()
    plt.hist(pixel_values, 256, [0, 256])
    plt.ylabel("Count")
    plt.xlabel("Pixel Color")
    mean_color = np.mean(pixel_values)
    color_std = np.std(pixel_values)
    LOGGER.info("Image statistics")
    LOGGER.info("mean: {}".format(mean_color))
    LOGGER.info("std: {}".format(color_std))
    LOGGER.info("median: {}".format(np.median(pixel_values)))

    # Command-line threshold wins; otherwise derive one from the image.
    thresh = args.thresh
    if thresh is None:
        thresh = default_background_threshold(img)
    img = grayscale_to_black_and_white(img, thresh)
    LOGGER.info("background threshold: {}".format(thresh))
    color_fig.show()

    # The thresholded image on its own.
    bw_fig = plt.figure()
    plt.imshow(img, cmap="gray")
    hide_ticks()
    bw_fig.show()

    # Coordinates of every pixel treated as text.
    text_pixels = colored_pixels(img, thresh)
    xs, ys = zip(*text_pixels)

    # Distribution of text pixels along the vertical axis.
    y_hist_fig = plt.figure()
    plt.hist(ys, h, [0, h])
    plt.ylabel("Count")
    plt.xlabel("Y Position from Top")
    y_hist_fig.show()

    # Text pixels marked on the image. This figure stays the current one,
    # so the boundaries plotted below are drawn onto it as well.
    overlay_fig = plt.figure()
    plt.imshow(img, cmap='gray')
    assert len(xs) == len(ys)
    assert max(xs) <= w
    assert max(ys) <= h

    plt.scatter(xs, ys, marker=".", color="r")
    hide_ticks()
    overlay_fig.show()

    # Detect and draw line boundaries.
    line_positions = line_regions(img,
                                  min_dist=args.min_line_dist,
                                  bg_thresh=thresh,
                                  min_pixels=args.min_line_pixels)
    draw_line_bounds(line_positions, w)

    # Detect and draw character boundaries.
    char_regions = character_regions(img,
                                     line_positions,
                                     bg_thresh=thresh,
                                     min_dist=args.min_char_dist,
                                     min_pixels=args.min_char_pixels)
    assert len(char_regions) == len(line_positions)
    draw_char_bounds(line_positions, char_regions)

    # One histogram per line: text pixels along the horizontal axis.
    for top, bottom in line_positions:
        line_pixels = [p for p in text_pixels if top <= p[1] <= bottom]
        line_xs, _ = zip(*line_pixels)
        x_hist_fig = plt.figure()
        plt.hist(line_xs, bins=w, range=[0, w])
        plt.ylabel("Count")
        plt.xlabel("X Position from Left")
        x_hist_fig.show()

    # Image with line boundaries only.
    lines_fig = plt.figure()
    plt.imshow(img, cmap="gray")
    hide_ticks()
    draw_line_bounds(line_positions, w)
    lines_fig.show()

    # Image with both line and character boundaries.
    chars_fig = plt.figure()
    plt.imshow(img, cmap="gray")
    hide_ticks()
    draw_line_bounds(line_positions, w)
    draw_char_bounds(line_positions, char_regions)
    chars_fig.show()

    if not args.save_dir:
        # Keep figures alive
        raw_input("Press 'Enter' to close the current images.")
        return 0

    if not args.labels:
        raise RuntimeError(
            "If you are saving the characters as sample/training data, "
            "you must provided a comma-separated string of labels to "
            "save each sample as.")
    save_images(img, line_positions, char_regions, args.labels,
                args.save_dir)
    return 0
Ejemplo n.º 9
0
def get_text_from_regions(img,
                          line_regs,
                          char_regs,
                          classifier,
                          bg_thresh=None,
                          resize=None,
                          **kwargs):
    """
    Try to extract text from an image given the line and character
    regions, and classifier.

    Args:
        img (numpy.ndarray): Image to extract text from. The image should be
            unrotated and be a dark text on a light background.
            NOTE(review): an earlier version of this docstring said the image
            is grayscaled inside this function; no conversion happens in this
            body, so confirm what transform/classifier expect.
        line_regs (list[tuple[int, int]]): Line regions in image, as
            (start, end) row pairs with the end row included.
        char_regs (list[list[tuple[int, int]]]): Character regions in each
            line in the image, as (start, end) column pairs with the end
            column included. Entry i belongs to line_regs[i].
        classifier (Classifier): Classifier to use to predict characters.
        bg_thresh (Optional[int]): Background threshold handed to transform.
            If not provided, a default is calculated for every character
            region via default_background_threshold.
        resize (Optional[tuple[int, int]]): Dimensions to resize each
            character image to. Defaults to None (indicating we should not
            resize). If provided, it should be a tuple containing the width
            and height.
            Ex: (width, height)
        **kwargs: Keyword arguments to pass to classifier predict method.

    Returns:
        str: The string extracted from the image. Every line is terminated
            by a newline character.
    """
    avg_char_dist = avg_dist_between_chars(char_regs)
    LOGGER.debug("avg char dist: {}".format(avg_char_dist))

    # Gather the fragments and join them once; repeated string concatenation
    # inside the nested loop copies the growing result over and over.
    pieces = []
    for i, char_region in enumerate(char_regs):
        starty, endy = line_regs[i]
        last_end = None
        LOGGER.debug("Checking line {}".format(i))
        for startx, endx in char_region:
            sub_img = img[starty:endy + 1, startx:endx + 1]

            LOGGER.debug("Transforming region between points {} {}".format(
                (startx, starty), (endx, endy)))

            # Prefer the caller's threshold; fall back to one derived from
            # this character's own pixels.
            if bg_thresh is None:
                char_thresh = default_background_threshold(sub_img)
            else:
                char_thresh = bg_thresh
            LOGGER.debug("background threshold: {}".format(char_thresh))

            if resize is not None:
                sub_img = transform(sub_img, resize[0], resize[1], char_thresh)
                if sub_img is None:
                    # Nothing usable came back from transform; skip the
                    # region and leave last_end as it was.
                    continue
            prediction = classifier.predict([sub_img], **kwargs)[0]
            LOGGER.debug("prediction: {}".format(prediction))

            # Add space if characters are far enough apart
            if last_end is not None and startx - last_end > avg_char_dist:
                pieces.append(" ")

            pieces.append(prediction)
            last_end = endx
        pieces.append("\n")
    return "".join(pieces)
Ejemplo n.º 10
0
def main():
    """
    Visualize how an image is split into lines and characters.

    Reads the image named on the command line, thresholds it to black and
    white, and shows a series of matplotlib figures: the pixel color
    histogram, the thresholded image, the text-pixel distribution along the
    y-axis, the detected text pixels with line/character boundaries, one
    x-axis histogram per detected line, and the boundaries drawn over the
    image. When a save directory is given the character images are saved
    through save_images; otherwise the function waits for Enter so the
    figures stay open.

    Returns:
        int: Always 0.

    Raises:
        RuntimeError: If a save directory is given without labels.
    """

    def hide_ticks():
        # Hide tick values on the X and Y axis of the current figure.
        plt.xticks([])
        plt.yticks([])

    def draw_line_bounds(lines, width):
        # Horizontal lines marking where each text line starts and ends.
        for bounds in lines:
            for y in (bounds[0], bounds[1]):
                plt.plot([0, width], [y, y], color="b")

    def draw_char_bounds(lines, chars_per_line):
        # Vertical lines at the left and right edge of every character,
        # limited to the height of its text line.
        for idx, spans in enumerate(chars_per_line):
            top, bottom = lines[idx]
            for left, right in spans:
                for x in (left, right):
                    plt.plot([x, x], [top, bottom], color="b")

    args = region_argument_parser(extraction_argument_parser())

    # Load the image (imread flag 0) and rescale it if requested.
    scale = args.resize
    img = cv2.imread(args.filename, 0)
    if scale != 1:
        img = cv2.resize(img, None, fx=scale, fy=scale)
    LOGGER.info("Checking image {}".format(args.filename))
    LOGGER.info("resized image shape: {}".format(img.shape))

    # Histogram and statistics of the pixel colors before thresholding.
    h, w = img.shape[:2]
    pixel_values = img.ravel()
    color_fig = plt.figure()
    plt.hist(pixel_values, 256, [0, 256])
    plt.ylabel("Count")
    plt.xlabel("Pixel Color")
    mean_color = np.mean(pixel_values)
    color_std = np.std(pixel_values)
    LOGGER.info("Image statistics")
    LOGGER.info("mean: {}".format(mean_color))
    LOGGER.info("std: {}".format(color_std))
    LOGGER.info("median: {}".format(np.median(pixel_values)))

    # Command-line threshold wins; otherwise derive one from the image.
    thresh = args.thresh
    if thresh is None:
        thresh = default_background_threshold(img)
    img = grayscale_to_black_and_white(img, thresh)
    LOGGER.info("background threshold: {}".format(thresh))
    color_fig.show()

    # The thresholded image on its own.
    bw_fig = plt.figure()
    plt.imshow(img, cmap="gray")
    hide_ticks()
    bw_fig.show()

    # Coordinates of every pixel treated as text.
    text_pixels = colored_pixels(img, thresh)
    xs, ys = zip(*text_pixels)

    # Distribution of text pixels along the vertical axis.
    y_hist_fig = plt.figure()
    plt.hist(ys, h, [0, h])
    plt.ylabel("Count")
    plt.xlabel("Y Position from Top")
    y_hist_fig.show()

    # Text pixels marked on the image. This figure remains the current one,
    # so the boundaries plotted below land on it too.
    overlay_fig = plt.figure()
    plt.imshow(img, cmap='gray')
    assert len(xs) == len(ys)
    assert max(xs) <= w
    assert max(ys) <= h

    plt.scatter(xs, ys, marker=".", color="r")
    hide_ticks()
    overlay_fig.show()

    # Detect and draw line boundaries.
    line_positions = line_regions(img, min_dist=args.min_line_dist,
                                  bg_thresh=thresh,
                                  min_pixels=args.min_line_pixels)
    draw_line_bounds(line_positions, w)

    # Detect and draw character boundaries.
    char_regions = character_regions(
        img, line_positions, bg_thresh=thresh, min_dist=args.min_char_dist,
        min_pixels=args.min_char_pixels)
    assert len(char_regions) == len(line_positions)
    draw_char_bounds(line_positions, char_regions)

    # One histogram per line: text pixels along the horizontal axis.
    for top, bottom in line_positions:
        line_pixels = [p for p in text_pixels if top <= p[1] <= bottom]
        line_xs, _ = zip(*line_pixels)
        x_hist_fig = plt.figure()
        plt.hist(line_xs, bins=w, range=[0, w])
        plt.ylabel("Count")
        plt.xlabel("X Position from Left")
        x_hist_fig.show()

    # Image with line boundaries only.
    lines_fig = plt.figure()
    plt.imshow(img, cmap="gray")
    hide_ticks()
    draw_line_bounds(line_positions, w)
    lines_fig.show()

    # Image with both line and character boundaries.
    chars_fig = plt.figure()
    plt.imshow(img, cmap="gray")
    hide_ticks()
    draw_line_bounds(line_positions, w)
    draw_char_bounds(line_positions, char_regions)
    chars_fig.show()

    if not args.save_dir:
        # Keep figures alive
        raw_input("Press 'Enter' to close the current images.")
        return 0

    if not args.labels:
        raise RuntimeError(
            "If you are saving the characters as sample/training data, "
            "you must provided a comma-separated string of labels to "
            "save each sample as.")
    save_images(img, line_positions, char_regions, args.labels,
                args.save_dir)
    return 0