def create_text_file_from_page(page: "Page", path_to_save_file=None):
    """Write the text of all articles of a page into one plain-text file.

    The text lines of each article are written one per line; text lines whose
    ``text`` is empty or ``None`` are skipped. Consecutive articles are
    separated by a line of 100 ``#`` characters surrounded by blank lines.

    :param page: object providing ``get_article_dict()``, whose values are
        iterables of text lines with a ``text`` attribute
    :param path_to_save_file: path of the text file to (over)write
    :raises TypeError: if ``path_to_save_file`` is ``None``
    """
    if path_to_save_file is None:
        # open(None) would raise a TypeError as well, but with an opaque message.
        raise TypeError("path_to_save_file must be a file path, got None")

    article_dict = page.get_article_dict()
    separator = '\n' + '#' * 100 + '\n\n'

    # Explicit UTF-8 so the output does not depend on the platform's locale encoding.
    with open(path_to_save_file, 'w', encoding='utf-8') as f:
        for i, textlines in enumerate(article_dict.values()):
            # A separator goes between articles only, never after the last one.
            if i:
                f.write(separator)
            f.writelines(tl.text + "\n" for tl in textlines if tl.text)
def create_text_files_from_page_list(page_list, path_to_save_folder=None):
    """Create one text file per PageXml path in ``page_list``.

    The output file is named after the page file with ``.txt`` appended. It is
    placed in ``path_to_save_folder`` when that is given, otherwise next to
    the page file itself.

    :param page_list: iterable of paths to PageXml files
    :param path_to_save_folder: optional folder for the generated text files
    """
    for page_path in page_list:
        file_name = os.path.basename(page_path)
        target_path = (os.path.join(path_to_save_folder, file_name + '.txt')
                       if path_to_save_folder else page_path + ".txt")
        create_text_file_from_page(Page(page_path), target_path)
def save_results_in_pagexml(path_to_pagexml, text_region_txtline_dict):
    """
    Replace the text regions of a Page XML file by the given ones and write the file back in place.

    :param path_to_pagexml: file path (the file is loaded and afterwards overwritten)
    :param text_region_txtline_dict: dictionary {text region id: (list of boundary_points,
                                                list of corresponding text lines, reading order of the region)}
    """
    page_file = Page(path_to_pagexml)
    text_regions = []

    for region_id, region_data in text_region_txtline_dict.items():
        region_points = region_data[0]
        region_txtlines = region_data[1]
        region_order = region_data[2]

        # the text lines get their reading order before they are attached to the region
        txtlines_set_reading_order(lst_of_txtlines=region_txtlines)

        text_regions.append(
            TextRegion(_id=region_id, region_type="paragraph",
                       custom={"readingOrder": {"index": region_order}},
                       points=region_points, text_lines=region_txtlines))

    page_file.set_text_regions(text_regions=text_regions, overwrite=True)
    page_file.write_page_xml(path_to_pagexml)
Esempio n. 4
0
def get_article_rectangles_from_surr_polygons(page, use_max_rect_size=True, max_d=0, max_rect_size_scale=1 / 50,
                                              max_d_scale=1 / 20):
    """Given the PageXml file `page` return the corresponding article subregions as a list of ArticleRectangle objects.
     Also returns the width and height of the image (NOT of the PrintSpace).

    :param page: Either the path to the PageXml file or a Page object.
    :type page: Union[str, Page]
    :param use_max_rect_size: whether to use a max rectangle size for the article rectangles or not
    :type use_max_rect_size: bool
    :param max_d: value handed to ``create_subregions_from_surrounding_polygon``; if falsy (default 0) it is
        derived as ``int(max_d_scale * height)`` of the print space bounding box
    :param max_rect_size_scale: fraction of the print space bounding box height used as max rectangle size
        (only if ``use_max_rect_size`` is set, otherwise the max rectangle size is 0)
    :param max_d_scale: fraction of the print space bounding box height used for ``max_d`` if none is given
    :return: the article subregion list, the height and the width of the image
    """
    # isinstance (instead of an exact type comparison) also accepts subclasses of str / Page
    if isinstance(page, str):
        page = Page(page)

    assert isinstance(page, Page), f"Type must be Page, got {type(page)} instead."
    ps_coords = page.get_print_space_coords()
    ps_poly = Points(ps_coords).to_polygon()
    # Maybe check if the surrounding Rectangle of the polygon has corners given by ps_poly
    ps_bounding_box = ps_poly.get_bounding_box()

    # First ArticleRectangle to consider: the whole print space together with all text lines of the page
    ps_rectangle = ArticleRectangle(ps_bounding_box.x, ps_bounding_box.y, ps_bounding_box.width,
                                    ps_bounding_box.height, page.get_textlines())

    max_rect_size = int(max_rect_size_scale * ps_rectangle.height) if use_max_rect_size else 0
    if not max_d:
        max_d = int(max_d_scale * ps_rectangle.height)

    ars = ps_rectangle.create_subregions_from_surrounding_polygon(max_d=max_d, max_rect_size=max_rect_size)

    img_width, img_height = page.get_image_resolution()

    # Note the order: height first, then width.
    return ars, img_height, img_width
Esempio n. 5
0
def get_data_from_pagexml(path_to_pagexml):
    """
    Load a Page XML file and collect the baselines of its text lines article wise.

    If the file can not be loaded, a message is printed and an empty dictionary is returned. Text lines without
    a baseline are reported and skipped, as are baselines with less than two points.

    :param path_to_pagexml: file path

    :return: dictionary with the article / block ID's as keys and a list of corresponding baselines (given by polygons)
    as values
    """
    art_polygons_dict = {}

    try:
        # load the page xml file
        page_file = Page(path_to_xml=path_to_pagexml)
        # get all text lines article wise
        art_txtlines_dict = page_file.get_article_dict()
    except Exception:
        # The former `except ():` matched no exception at all, so this fallback was never reached.
        print("!! Can not load the lines of the Page XML {} !!\n".format(
            path_to_pagexml))
        return art_polygons_dict

    for article_id, txtlines in art_txtlines_dict.items():
        for txtline in txtlines:
            try:
                # get the baseline of the text line as polygon
                polygon = txtline.baseline.to_polygon()
            except AttributeError:
                # raised when the text line has no baseline (baseline is None)
                print(
                    "!! 'NoneType' object with id {} has no attribute 'to_polygon' !!\n"
                    .format(getattr(txtline, "id", None)))
                continue

            # skip baselines with less than two points
            if len(polygon.x_points) == len(polygon.y_points) > 1:
                art_polygons_dict.setdefault(article_id, []).append(polygon)

    return art_polygons_dict
 def create_page_objects(self):
     """Return a list with one ``Page`` per path in ``self.page_path_lst``, in the same order."""
     pages = []
     for path in self.page_path_lst:
         pages.append(Page(path))
     return pages
Esempio n. 7
0
def plot_pagexml(page,
                 path_to_img,
                 ax=None,
                 plot_article=True,
                 plot_legend=True,
                 fill_regions=False,
                 use_page_image_resolution=False):
    """Collect the drawable content of a PageXml file and hand it to ``plot_ax``.

    Gathered are the baselines grouped by article id, the surrounding polygons
    of the text lines, the polygons of all regions grouped by region name and
    the surrounding polygons of the words.

    :param page: either the path to the PageXml file or a Page object
    :param path_to_img: path to the image, forwarded to ``plot_ax``
    :param ax: axes object forwarded to ``plot_ax``
    :param plot_article: if True the baselines of each article get their own
        color (``DEFAULT_COLOR`` for the article id ``None``), otherwise all
        baselines use ``DEFAULT_COLOR``
    :param plot_legend: forwarded to ``plot_ax``
    :param fill_regions: forwarded to ``plot_ax``
    :param use_page_image_resolution: if True the image resolution stored in
        the page is forwarded to ``plot_ax`` as height / width, otherwise both
        are ``None``
    """
    # isinstance (instead of an exact type comparison) also accepts subclasses of str / Page
    if isinstance(page, str):
        page = Page(page)
    assert isinstance(page, Page), f"Type must be Page, got {type(page)} instead."

    # get baselines based on the article id
    article_dict = page.get_article_dict()
    if not article_dict:
        bcolors = []
        blines_list = []
    else:
        unique_ids = sorted(set(article_dict.keys()),
                            key=functools.cmp_to_key(compare_article_ids))
        if None in unique_ids:
            # The last color slot is the default color.
            # NOTE(review): this presumes compare_article_ids sorts None last - confirm.
            palette = COLORS[:len(unique_ids) - 1] + [DEFAULT_COLOR]
        else:
            palette = COLORS[:len(unique_ids)]
        article_colors = dict(zip(unique_ids, palette))

        if plot_article:
            bcolors = [article_colors[article_id] for article_id in unique_ids]
        else:
            bcolors = [DEFAULT_COLOR] * len(article_dict)

        # one list of baselines per article, text lines without baseline are left out
        blines_list = [[
            textline.baseline.points_list
            for textline in article_dict[article_id] if textline.baseline
        ] for article_id in unique_ids]

    region_dict = page.get_regions()
    if not region_dict:
        rcolors = {}
        region_dict_polygons = {}
    else:
        rcolors = {
            page_constants.sTEXTREGION: "darkgreen",
            page_constants.sSEPARATORREGION: "darkviolet",
            page_constants.sGRAPHICREGION: "darkcyan",
            page_constants.sIMAGEREGION: "darkblue",
            page_constants.sTABLEREGION: "darkorange",
            page_constants.sADVERTREGION: "yellow",
            page_constants.sLINEDRAWINGREGION: "salmon",
            page_constants.sCHARTREGION: "brown",
            page_constants.sCHEMREGION: "navy",
            page_constants.sMATHSREGION: "crimson",
            page_constants.sNOISEREGION: "darkkhaki",
            page_constants.sMUSICREGION: "firebrick",
            page_constants.sUNKNOWNREGION: "darkorchid"
        }
        region_dict_polygons = {
            region_name: [region.points.points_list for region in regions]
            for region_name, regions in region_dict.items()
        }

    # get surrounding polygons
    textlines = page.get_textlines()
    surr_polys = [
        tl.surr_p.points_list for tl in textlines if (tl and tl.surr_p)
    ]

    words = page.get_words()
    word_polys = [
        word.surr_p.points_list for word in words if (word and word.surr_p)
    ]

    if use_page_image_resolution:
        # NOTE(review): the result is unpacked as (height, width) here, while other
        # callers of get_image_resolution() unpack it as (width, height) - confirm the order.
        page_height, page_width = page.get_image_resolution()
    else:
        page_height = page_width = None

    plot_ax(ax,
            path_to_img,
            blines_list,
            surr_polys,
            bcolors,
            region_dict_polygons,
            rcolors,
            word_polys,
            plot_legend,
            fill_regions=fill_regions,
            height=page_height,
            width=page_width)
Esempio n. 8
0
def get_article_rectangles_from_baselines(page, image_path, stretch=False, use_surr_polygons=True):
    """Group the text lines of every article of a page into ``ArticleRectangle`` objects.

    For each article the text lines are sorted with ``sort_textlines_by_y``. Every text line that is not yet part
    of a rectangle starts a new one, which is then greedily extended by later text lines of the same article as long
    as the merged rectangle neither intersects an already created rectangle nor contains a baseline of another
    article.

    :param page: either the path to the PageXml file or a Page object
    :param image_path: path of the image, only read (via ``get_binarization``) if ``stretch`` is set
    :param stretch: if True every finished rectangle is passed through ``stretch_rectangle_until_whitespace``
    :param use_surr_polygons: if True the bounding box of a text line's surrounding polygon is used (falling back to
        the baseline's bounding box for lines without surrounding polygon), otherwise always the baseline's
        bounding box
    :return: ``defaultdict(list)`` mapping each article id to its list of ``ArticleRectangle`` objects
    """
    # Accept the path to a PageXml file as well as an already loaded Page.
    if type(page) == str:
        page = Page(page)

    assert type(page) == Page, f"Type must be Page, got {type(page)} instead."
    article_dict = page.get_article_dict()

    article_rectangles_dict = defaultdict(list)

    # The binarized image is only defined (and only used, see the end of the inner loop) when stretching.
    if stretch:
        binarized_image = get_binarization(image_path)

    for article_id, textlines in article_dict.items():
        # ids of the text lines of this article that already belong to a rectangle
        used_textline_ids = []
        sorted_textlines = sort_textlines_by_y(textlines)
        for i, textline in enumerate(sorted_textlines):
            # used_textline_ids = [tl.id for article_rectangle in article_rectangles_dict[article_id] for tl in
            #                      article_rectangle.textlines]
            if textline.id in used_textline_ids:
                continue

            baseline = textline.baseline.points_list
            baseline_polygon = textline.baseline.to_polygon()

            # Start box: surrounding polygon if requested and available, else the baseline itself.
            if use_surr_polygons:
                baseline_bounding_box = textline.surr_p.to_polygon().get_bounding_box() if textline.surr_p else baseline_polygon.get_bounding_box()
            else:
                baseline_bounding_box = baseline_polygon.get_bounding_box()

            # [ar for aid, ar in article_rectangles_dict.items() if aid != article_id]

            # print(baseline_bounding_box.get_vertices())
            # print(article_id)
            # While the start box overlaps a rectangle of ANOTHER article, translate it by (0, 1) and reduce its
            # height by one; at most 20 such steps per rectangle.
            for ars in [ar for aid, ar in article_rectangles_dict.items() if aid != article_id]:
                for ar in ars:
                    intersection = ar.intersection(baseline_bounding_box)
                    for _ in range(20):
                        if intersection.width > 0 and intersection.height > 0:
                            baseline_bounding_box.translate(0, 1)
                            baseline_bounding_box.height -= 1
                            intersection = ar.intersection(baseline_bounding_box)
                        else:
                            break

            article_rectangle = ArticleRectangle(baseline_bounding_box.x, baseline_bounding_box.y,
                                                 baseline_bounding_box.width, baseline_bounding_box.height,
                                                 [textline], None)

            used_textline_ids.append(textline.id)
            # NOTE(review): enumerate() yields i <= len(sorted_textlines) - 1, so this condition is never true.
            # If it were, the `continue` would also skip appending the rectangle below - confirm the intent.
            if i == len(sorted_textlines):
                continue
            # Try to extend the rectangle with the remaining, not yet used text lines of the same article.
            for j, textline_compare in enumerate(sorted_textlines[i + 1:]):
                if textline_compare.id in used_textline_ids:
                    continue
                # for tl in article_rectangle.textlines:
                #     print(tl.baseline.points_list)
                baseline_compare = textline_compare.baseline.points_list
                skip = False

                # instead of checking whether the two baselines are aligned, we should check, if the current article
                # rectangle and the baseline_compare are aligned!
                # first two vertices of the rectangle - presumably its top edge, TODO confirm
                article_rectangle_horizontal_poly = article_rectangle.get_vertices()[:2]

                # if not is_vertical_aligned(baseline, baseline_compare):
                if not is_vertical_aligned(article_rectangle_horizontal_poly, baseline_compare):
                    # Not aligned: the candidate is only kept if some later unused line is aligned with both the
                    # start baseline and (with margin 50) the candidate. If all later lines are already used,
                    # `skip` stays False and the candidate is kept as well.
                    if i + j + 2 != len(sorted_textlines):
                        for tl in sorted_textlines[i + j + 2:]:
                            if tl.id not in used_textline_ids:
                                if is_vertical_aligned(baseline, tl.baseline.points_list) and is_vertical_aligned(
                                        baseline_compare, tl.baseline.points_list, margin=50):
                                    skip = False
                                    break
                                else:
                                    skip = True
                    else:
                        # the candidate is the last text line, there is nothing left to bridge the gap
                        skip = True
                if skip:
                    continue

                baseline_compare_polygon = textline_compare.baseline.to_polygon()
                if use_surr_polygons:
                    baseline_compare_bounding_box = textline_compare.surr_p.to_polygon().get_bounding_box() if textline_compare.surr_p else baseline_compare_polygon.get_bounding_box()
                else:
                    baseline_compare_bounding_box = baseline_compare_polygon.get_bounding_box()

                merged_rectangle = merge_rectangles([article_rectangle, baseline_compare_bounding_box])

                # Reject the merge if the merged rectangle overlaps any rectangle stored so far (of any article).
                skip = False
                for ars in article_rectangles_dict.values():
                    for ar in ars:
                        intersection = ar.intersection(merged_rectangle)
                        if intersection.width > 0 and intersection.height > 0:
                            skip = True
                            break
                    if skip:
                        break
                if skip:
                    continue

                merged_article_rectangle = ArticleRectangle(merged_rectangle.x, merged_rectangle.y,
                                                            merged_rectangle.width, merged_rectangle.height)
                # if merged_article_rectangle contains any other baseline, that is not yet in an article_rectangle, skip
                # textlines_to_check_intersection = [tl for tl in sorted_textlines if
                #                                    tl.id not in used_textline_ids and tl.id != textline_compare.id]
                # As written, only the text lines of the OTHER articles are checked.
                textlines_to_check_intersection = []
                textlines_to_check_intersection += [tl for textlines in
                                                    [article_dict[aid] for aid in article_dict if
                                                     aid != article_id] for tl in textlines]
                # polygons_to_check_intersection = [tl.surr_p.to_polygon() if tl.surr_p is not None else
                #                                   tl.baseline.to_polygon() for tl in textlines_to_check_intersection]
                polygons_to_check_intersection = [tl.baseline.to_polygon() for tl in textlines_to_check_intersection]

                skip = False
                for polygon in polygons_to_check_intersection:
                    if merged_article_rectangle.contains_polygon(polygon, merged_article_rectangle.x,
                                                                 merged_article_rectangle.y,
                                                                 merged_article_rectangle.width,
                                                                 merged_article_rectangle.height):
                        skip = True

                        # Retry with a copy that is translated by (0, 1) and one unit less high.
                        merged_article_rectangle_copy = copy.deepcopy(merged_article_rectangle)
                        for _ in range(50):
                            merged_article_rectangle_copy.translate(0, 1)
                            merged_article_rectangle_copy.height -= 1
                            if not merged_article_rectangle_copy.contains_polygon(polygon,
                                                                                  merged_article_rectangle_copy.x,
                                                                                  merged_article_rectangle_copy.y,
                                                                                  merged_article_rectangle_copy.width,
                                                                                  merged_article_rectangle_copy.height):
                                skip = False
                            # NOTE(review): the assignment and the `break` sit at loop level, so this loop always
                            # ends after its first iteration (a single shrink step). Possibly both were meant to be
                            # inside the `if` above - confirm.
                            merged_article_rectangle = merged_article_rectangle_copy
                            break

                    if skip:
                        break

                if skip:
                    continue

                # The merge is accepted: the candidate joins the rectangle, which takes the merged bounds.
                article_rectangle.textlines.append(textline_compare)
                article_rectangle.set_bounds(merged_article_rectangle.x, merged_article_rectangle.y,
                                             merged_article_rectangle.width, merged_article_rectangle.height)
                used_textline_ids.append(textline_compare.id)

            # A rectangle with a single text line that has no surrounding polygon is translated by (0, -10) and
            # gets a fixed height of 10.
            if len(article_rectangle.textlines) == 1:
                if article_rectangle.textlines[0].surr_p:
                    # bb = article_rectangle.textlines[0].surr_p.to_polygon().get_bounding_box()
                    # article_rectangle.set_bounds(bb.x, bb.y, bb.width, bb.height)
                    pass
                else:
                    article_rectangle.translate(0, -10)
                    article_rectangle.height = 10

            if stretch:
                # whitespace height and stretch limit are derived from the number of rows of the binarized image
                img_height = len(binarized_image)
                article_rectangle = stretch_rectangle_until_whitespace(binarized_image, article_rectangle,
                                                                       whitespace_height=max(1, img_height // 1000),
                                                                       stretch_limit=img_height // 10)

            article_rectangles_dict[article_id].append(article_rectangle)

    return article_rectangles_dict
Esempio n. 9
0
    # img_path = "/home/max/data/as/NewsEye_ONB_data_corrected/krz/ONB_krz_19110701_corrected/ONB_krz_19110701_016" \
    #            ".jpg"
    # #
    img_path = "/home/max/devel/projects/article_separation/data/newseye_onb/ibn/ONB_ibn_18640702_corrected/ONB_ibn_18640702_003.tif"
    xml_path = "/home/max/devel/projects/article_separation/data/newseye_onb/ibn/ONB_ibn_18640702_corrected/page/ONB_ibn_18640702_003.xml"

    # xml_path = "/home/max/data/as/NewsEye_ONB_data_corrected/ibn/ONB_ibn_19330701_corrected/page/ONB_ibn_19330701_001.xml"
    # img_path = "/home/max/data/as/NewsEye_ONB_data_corrected/ibn/ONB_ibn_19330701_corrected/ONB_ibn_19330701_001.jpg"
    # # #
    # xml_path = "/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18730705_corrected/page/ONB_nfp_18730705_016.xml"
    # img_path = "/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18730705_corrected/ONB_nfp_18730705_016.tif"
    #
    # xml_path = '/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18950706_corrected/page/ONB_nfp_18950706_015.xml'
    # img_path = '/home/max/data/as/NewsEye_ONB_data_corrected/nfp/ONB_nfp_18950706_corrected/ONB_nfp_18950706_015.tif'

    article_rectangles_dict = get_article_rectangles_from_baselines(Page(xml_path), img_path, use_surr_polygons=True,
                                                                    stretch=False)

    surr_polys_dict = merge_article_rectangles_vertically(article_rectangles_dict)

    import matplotlib.pyplot as plt
    from citlab_python_util.parser.xml.page import plot as page_plot
    from matplotlib.collections import PolyCollection
    from citlab_python_util.plot import colors

    # page_plot.plot_pagexml(xml_path, img_path)

    fig, ax = plt.subplots()
    page_plot.add_image(ax, img_path)

    for i, a_id in enumerate(surr_polys_dict):
def get_data_from_pagexml(path_to_pagexml, des_dist=50, max_d=500, use_java_code=True):
    """
    Load a Page XML file and compute, for every text line with a usable baseline, its normed baseline polygon and
    its interline distance. Text lines without a surrounding polygon get one generated from the normed baseline; it
    is stored on the matching text lines of the returned article dictionary.

    :param path_to_pagexml: file path
    :param des_dist: desired distance (measured in pixels) of two adjacent pixels in the normed polygons
    :param max_d: maximum distance (measured in pixels) for the calculation of the interline distances
    :param use_java_code: usage of methods written in java (faster than python!) or not

    :return: two dictionaries: {article id: corresponding list of text lines}
                               {text line id: (normed polygon, interline distance)}
    """
    # load the page xml file
    page_file = Page(path_to_pagexml)

    # get all text lines article wise
    art_txtlines_dict = page_file.get_article_dict()
    # get all text lines of the loaded page file
    lst_of_txtlines = page_file.get_textlines()

    lst_of_polygons = []
    lst_of_txtlines_adjusted = []

    for txtline in lst_of_txtlines:
        try:
            # get the baseline of the text line as polygon (converted only once)
            baseline = txtline.baseline.to_polygon()
            # baselines with less than two points are skipped
            if len(baseline.x_points) == len(baseline.y_points) > 1:
                lst_of_polygons.append(baseline)
                lst_of_txtlines_adjusted.append(txtline)
        except AttributeError:
            # text line without baseline ('NoneType' object has no attribute 'to_polygon')
            continue

    # normed polygons
    lst_of_normed_polygons = norm_poly_dists(poly_list=lst_of_polygons, des_dist=des_dist)
    # interline distances
    lst_of_intdists = get_list_of_interline_distances(lst_of_polygons=lst_of_polygons, max_d=max_d,
                                                      use_java_code=use_java_code)

    # Index the text lines of the article dictionary by id once, instead of scanning all articles again for every
    # text line that lacks a surrounding polygon.
    reference_txtlines_by_id = {}
    for reference_txtlines in art_txtlines_dict.values():
        for reference_txtline in reference_txtlines:
            reference_txtlines_by_id.setdefault(reference_txtline.id, []).append(reference_txtline)

    txtline_dict = {}
    for i, txtline in enumerate(lst_of_txtlines_adjusted):
        normed_polygon = lst_of_normed_polygons[i]
        intdist = lst_of_intdists[i]

        # check the surrounding polygon of the text line
        if txtline.surr_p is None:
            x_points_shifted = [x + 1 for x in normed_polygon.x_points]
            # y values are shifted upwards by at least one pixel
            y_shift = max(int(0.95 * intdist), 1)
            y_points_shifted = [y - y_shift for y in normed_polygon.y_points]

            # closed outline: along the normed baseline and back along the shifted copy
            sp_points = list(zip(normed_polygon.x_points + x_points_shifted[::-1],
                                 normed_polygon.y_points + y_points_shifted[::-1]))

            for reference_txtline in reference_txtlines_by_id.get(txtline.id, ()):
                reference_txtline.surr_p = Points(sp_points)

        txtline_dict.update({txtline.id: (normed_polygon, intdist)})

    return art_txtlines_dict, txtline_dict
Esempio n. 11
0
                files_exist = check_if_files_exist(
                    article_gt_filename, article_boundary_gt_filename,
                    other_gt_filename, downscaled_grey_img_filename,
                    rotation_filename)

            if files_exist:
                print(
                    f"GT Files for PageXml {path_to_page_xml} already exist, skipping..."
                )
                continue

            # TODO: only generates files with '0's in it -> fix this
            with open(rotation_filename, "w") as rot:
                rot.write("0")

            page = Page(path_to_page_xml)
            img_width, img_height = page.get_image_resolution()
            article_rectangle_dict = get_article_rectangles_from_baselines(
                page,
                path_to_img,
                use_surr_polygons=args.use_surr_polys,
                stretch=args.use_stretch)

            if args.fixed_img_height:
                sc_factor = args.fixed_img_height / img_height
            else:
                sc_factor = args.scaling_factor

            surr_polys_dict = merge_article_rectangles_vertically(
                article_rectangle_dict,
                min_width_intersect=args.min_width_intersect,
                            f'Skipping image {curr_img} since the result files for query {_query} already exist.'
                        )
                        skip = True
                if skip:
                    continue

                # Get full path
                for image_path in image_paths:
                    if curr_img in image_path:
                        curr_img_path = image_path
                        break
                # print(curr_img_path)

                # open corresponding PAGE file
                page_path = get_corresponding_page_path(curr_img_path)
                page = Page(page_path)
                article_dict = page.get_article_dict()
                used_article_ids = []

                fig, ax = plt.subplots()
                plot.add_image(ax, curr_img_path)

                highest_conf_img = 0.0

                with open(
                        os.path.join(path_to_query_folder, curr_img + ".txt"),
                        'w+') as text_file:
                    text_file.write(f"QUERY: '{_query}'\n\n")

                for hit in query_results:
                    if hit[0] == curr_img:
    #                    "newseye_as_test_data/xml_files_gt/19000715_1-0003.xml"
    # path_to_img = "/home/johannes/devel/projects/as/ArticleSeparationMeasure/test/resources/" \
    #               "newseye_as_test_data/image_files/19000715_1-0003.jpg"

    path_to_page_xml = '/home/max/devel/projects/article_separation/data/newseye_onb/aze/ONB_aze_18950706_corrected/' \
                       'page/ONB_aze_18950706_5.xml'
    path_to_img = '/home/max/devel/projects/article_separation/data/newseye_onb/aze/ONB_aze_18950706_corrected/' \
                  'ONB_aze_18950706_5.jpg'

    # path_to_page_xml = "/home/johannes/devel/projects/as/ArticleSeparationMeasure/test/resources/" \
    #                    "newseye_as_test_data/xml_files_gt/19420115_1-0002.xml"
    # path_to_img = "/home/johannes/devel/projects/as/ArticleSeparationMeasure/test/resources/" \
    #               "newseye_as_test_data/image_files/19420115_1-0002.jpg"

    path_to_page_xml = path_to_page_xml.strip()
    page = Page(path_to_page_xml)

    # Get the article rectangles as a list of ArticleRectangle objects
    ars, img_height, img_width = get_article_rectangles_from_surr_polygons(page)

    # resize the image to draw the border polygons (if available)
    img_height += 1
    img_width += 1
    print("img_height = {}, img_width = {}".format(img_height, img_width))

    # Convert the list of article rectangles to a dictionary with the article ids as keys
    # and the corresponding list of rectangles as value
    ars_dict = filter_by_attribute(ars, "a_ids")
    print("Len(Blank) = ", len(ars_dict["blank"]))

    # Convert blank article rectangles (by rectangles)