def create_text_file_from_page(page: Page, path_to_save_file=None):
    """
    Write the text of a page to a plain text file, article by article.

    Every text line with non-empty text is written on its own line. Consecutive articles are separated by a
    line of 100 ``#`` characters surrounded by blank lines.

    :param page: page object whose ``get_article_dict()`` returns a mapping from article id to a list of text
        lines (objects with a ``text`` attribute)
    :param path_to_save_file: path of the text file to create (an existing file is overwritten); required
        although it has a default for backward compatibility

    :raises TypeError: if ``path_to_save_file`` is None
    """
    if path_to_save_file is None:
        # ``open(None)`` would raise a TypeError as well, just with a less helpful message.
        raise TypeError("path_to_save_file must be a file path, got None")

    article_dict = page.get_article_dict()
    separator = '\n' + '#' * 100 + '\n\n'

    # Explicit UTF-8: the transcribed text may contain characters the platform default encoding cannot
    # represent.
    with open(path_to_save_file, 'w', encoding='utf-8') as f:
        for i, textlines in enumerate(article_dict.values()):
            if i > 0:
                # separator between two articles, never after the last one
                f.write(separator)
            f.writelines(tl.text + "\n" for tl in textlines if tl.text)
Example #2
0
def get_data_from_pagexml(path_to_pagexml):
    """
    Collect the baselines of a PAGE XML file, grouped by article.

    Text lines without a baseline are reported and skipped; baselines with fewer than two points are skipped
    silently.

    :param path_to_pagexml: file path

    :return: dictionary with the article / block ID's as keys and a list of corresponding baselines (given by polygons)
    as values; an empty dictionary if the PAGE XML could not be loaded
    """
    art_polygons_dict = {}

    try:
        # load the page xml file
        page_file = Page(path_to_xml=path_to_pagexml)
        # get all text lines article wise
        art_txtlines_dict = page_file.get_article_dict()
    except Exception:
        # Best effort: an unreadable file yields an empty result instead of aborting the caller.
        # (The former ``except ():`` matched no exception at all, so this handler never ran.)
        print("!! Can not load the lines of the Page XML {} !!\n".format(
            path_to_pagexml))
        return art_polygons_dict

    for article_id, txtlines in art_txtlines_dict.items():
        for txtline in txtlines:
            try:
                # get the baseline of the text line as polygon
                polygon = txtline.baseline.to_polygon()
            except AttributeError:
                # the text line has no baseline (``baseline`` is None)
                print(
                    "!! 'NoneType' object with id {} has no attribute 'to_polygon' !!\n"
                    .format(txtline.id))
                continue

            # skip baselines with less than two points
            if len(polygon.x_points) == len(polygon.y_points) > 1:
                art_polygons_dict.setdefault(article_id, []).append(polygon)

    return art_polygons_dict
Example #3
0
def plot_pagexml(page,
                 path_to_img,
                 ax=None,
                 plot_article=True,
                 plot_legend=True,
                 fill_regions=False,
                 use_page_image_resolution=False):
    """
    Gather the baselines, regions, surrounding polygons and word polygons of a page and hand them to
    ``plot_ax`` together with the page image.

    :param page: ``Page`` object, or a string which is passed to ``Page(...)``
    :param path_to_img: image path forwarded to ``plot_ax``
    :param ax: axes object forwarded to ``plot_ax``
    :param plot_article: if true, the baselines of every article get their own color, otherwise all of them
        get ``DEFAULT_COLOR``
    :param plot_legend: forwarded to ``plot_ax``
    :param fill_regions: forwarded to ``plot_ax``
    :param use_page_image_resolution: if true, the height and width returned by ``page.get_image_resolution()``
        are forwarded to ``plot_ax``, otherwise ``None`` is forwarded for both
    """
    if type(page) == str:
        page = Page(page)
    assert type(page) == Page, f"Type must be Page, got {type(page)} instead."

    # baselines and their colors, grouped by article id
    bcolors = []
    blines_list = []
    article_dict = page.get_article_dict()
    if article_dict:
        unique_ids = sorted(set(article_dict),
                            key=functools.cmp_to_key(compare_article_ids))
        n_ids = len(unique_ids)
        if None in unique_ids:
            # the default color is appended last -- presumably ``compare_article_ids`` sorts None to the
            # end; verify against its definition
            palette = COLORS[:n_ids - 1] + [DEFAULT_COLOR]
        else:
            palette = COLORS[:n_ids]
        article_colors = dict(zip(unique_ids, palette))

        if plot_article:
            bcolors = [article_colors[art_id] for art_id in unique_ids]
        else:
            bcolors = [DEFAULT_COLOR] * len(article_dict)

        for art_id in unique_ids:
            blines_list.append([
                tline.baseline.points_list for tline in article_dict[art_id]
                if tline.baseline
            ])

    # region outlines and the color used for each region type
    rcolors = {}
    region_dict_polygons = {}
    region_dict = page.get_regions()
    if region_dict:
        rcolors = {
            page_constants.sTEXTREGION: "darkgreen",
            page_constants.sSEPARATORREGION: "darkviolet",
            page_constants.sGRAPHICREGION: "darkcyan",
            page_constants.sIMAGEREGION: "darkblue",
            page_constants.sTABLEREGION: "darkorange",
            page_constants.sADVERTREGION: "yellow",
            page_constants.sLINEDRAWINGREGION: "salmon",
            page_constants.sCHARTREGION: "brown",
            page_constants.sCHEMREGION: "navy",
            page_constants.sMATHSREGION: "crimson",
            page_constants.sNOISEREGION: "darkkhaki",
            page_constants.sMUSICREGION: "firebrick",
            page_constants.sUNKNOWNREGION: "darkorchid"
        }
        for region_name, regions in region_dict.items():
            region_dict_polygons[region_name] = [
                region.points.points_list for region in regions
            ]

    # surrounding polygons of the text lines
    surr_polys = []
    for tline in page.get_textlines():
        if tline and tline.surr_p:
            surr_polys.append(tline.surr_p.points_list)

    # surrounding polygons of the words
    word_polys = []
    for word in page.get_words():
        if word and word.surr_p:
            word_polys.append(word.surr_p.points_list)

    page_height = page_width = None
    if use_page_image_resolution:
        page_height, page_width = page.get_image_resolution()

    plot_ax(ax,
            path_to_img,
            blines_list,
            surr_polys,
            bcolors,
            region_dict_polygons,
            rcolors,
            word_polys,
            plot_legend,
            fill_regions=fill_regions,
            height=page_height,
            width=page_width)
Example #4
0
def get_article_rectangles_from_baselines(page, image_path, stretch=False, use_surr_polygons=True):
    """
    Group the text lines of every article into rectangles.

    For each article the text lines are ordered with ``sort_textlines_by_y``. Every text line that is not yet
    part of a rectangle seeds a new ``ArticleRectangle`` from its bounding box; the following text lines of the
    same article are then merged into it one by one, as long as the merged rectangle passes the alignment and
    overlap checks below.

    :param page: ``Page`` object, or a string which is passed to ``Page(...)``
    :param image_path: path of the page image; only used (via ``get_binarization``) when ``stretch`` is true
    :param stretch: if true, every finished rectangle is passed through ``stretch_rectangle_until_whitespace``
    :param use_surr_polygons: if true, the bounding box of a text line is taken from its surrounding polygon
        (``surr_p``) when it has one, otherwise from its baseline

    :return: ``defaultdict(list)`` mapping each article id to the list of its article rectangles
    """
    if type(page) == str:
        page = Page(page)

    assert type(page) == Page, f"Type must be Page, got {type(page)} instead."
    article_dict = page.get_article_dict()

    article_rectangles_dict = defaultdict(list)

    # the binarized image is only needed for the optional stretching step at the end
    if stretch:
        binarized_image = get_binarization(image_path)

    for article_id, textlines in article_dict.items():
        # ids of the text lines of this article that already belong to a rectangle
        used_textline_ids = []
        sorted_textlines = sort_textlines_by_y(textlines)
        for i, textline in enumerate(sorted_textlines):
            # used_textline_ids = [tl.id for article_rectangle in article_rectangles_dict[article_id] for tl in
            #                      article_rectangle.textlines]
            if textline.id in used_textline_ids:
                continue

            baseline = textline.baseline.points_list
            baseline_polygon = textline.baseline.to_polygon()

            # bounding box that seeds the new rectangle
            if use_surr_polygons:
                baseline_bounding_box = textline.surr_p.to_polygon().get_bounding_box() if textline.surr_p else baseline_polygon.get_bounding_box()
            else:
                baseline_bounding_box = baseline_polygon.get_bounding_box()

            # [ar for aid, ar in article_rectangles_dict.items() if aid != article_id]

            # print(baseline_bounding_box.get_vertices())
            # print(article_id)
            # While the seed box intersects a rectangle of another article, shift it by (0, 1) and reduce its
            # height by one -- at most 20 times per rectangle.
            for ars in [ar for aid, ar in article_rectangles_dict.items() if aid != article_id]:
                for ar in ars:
                    intersection = ar.intersection(baseline_bounding_box)
                    for _ in range(20):
                        if intersection.width > 0 and intersection.height > 0:
                            baseline_bounding_box.translate(0, 1)
                            baseline_bounding_box.height -= 1
                            intersection = ar.intersection(baseline_bounding_box)
                        else:
                            break

            article_rectangle = ArticleRectangle(baseline_bounding_box.x, baseline_bounding_box.y,
                                                 baseline_bounding_box.width, baseline_bounding_box.height,
                                                 [textline], None)

            used_textline_ids.append(textline.id)
            # NOTE(review): ``i`` comes from ``enumerate`` and is at most ``len(sorted_textlines) - 1``, so this
            # condition looks unreachable -- confirm whether ``len(sorted_textlines) - 1`` was intended.
            if i == len(sorted_textlines):
                continue
            # try to merge each of the following text lines of the same article into the rectangle
            for j, textline_compare in enumerate(sorted_textlines[i + 1:]):
                if textline_compare.id in used_textline_ids:
                    continue
                # for tl in article_rectangle.textlines:
                #     print(tl.baseline.points_list)
                baseline_compare = textline_compare.baseline.points_list
                skip = False

                # instead of checking whether the two baselines are aligned, we should check, if the current article
                # rectangle and the baseline_compare are aligned!
                article_rectangle_horizontal_poly = article_rectangle.get_vertices()[:2]

                # if not is_vertical_aligned(baseline, baseline_compare):
                if not is_vertical_aligned(article_rectangle_horizontal_poly, baseline_compare):
                    # Not aligned with the rectangle: look ahead at the text lines after the candidate. The
                    # candidate is kept only if an unused later line is aligned with both the seed baseline
                    # and the candidate (or if all later lines are already used).
                    if i + j + 2 != len(sorted_textlines):
                        for tl in sorted_textlines[i + j + 2:]:
                            if tl.id not in used_textline_ids:
                                if is_vertical_aligned(baseline, tl.baseline.points_list) and is_vertical_aligned(
                                        baseline_compare, tl.baseline.points_list, margin=50):
                                    skip = False
                                    break
                                else:
                                    skip = True
                    else:
                        # the candidate is the last text line, there is nothing to look ahead to
                        skip = True
                if skip:
                    continue

                baseline_compare_polygon = textline_compare.baseline.to_polygon()
                if use_surr_polygons:
                    baseline_compare_bounding_box = textline_compare.surr_p.to_polygon().get_bounding_box() if textline_compare.surr_p else baseline_compare_polygon.get_bounding_box()
                else:
                    baseline_compare_bounding_box = baseline_compare_polygon.get_bounding_box()

                merged_rectangle = merge_rectangles([article_rectangle, baseline_compare_bounding_box])

                # reject the merge if the merged rectangle intersects any rectangle stored so far (the
                # rectangle currently being built is only stored after this loop)
                skip = False
                for ars in article_rectangles_dict.values():
                    for ar in ars:
                        intersection = ar.intersection(merged_rectangle)
                        if intersection.width > 0 and intersection.height > 0:
                            skip = True
                            break
                    if skip:
                        break
                if skip:
                    continue

                merged_article_rectangle = ArticleRectangle(merged_rectangle.x, merged_rectangle.y,
                                                            merged_rectangle.width, merged_rectangle.height)
                # if merged_article_rectangle contains any other baseline, that is not yet in an article_rectangle, skip
                # textlines_to_check_intersection = [tl for tl in sorted_textlines if
                #                                    tl.id not in used_textline_ids and tl.id != textline_compare.id]
                # the baselines of all text lines that belong to the other articles
                textlines_to_check_intersection = []
                textlines_to_check_intersection += [tl for textlines in
                                                    [article_dict[aid] for aid in article_dict if
                                                     aid != article_id] for tl in textlines]
                # polygons_to_check_intersection = [tl.surr_p.to_polygon() if tl.surr_p is not None else
                #                                   tl.baseline.to_polygon() for tl in textlines_to_check_intersection]
                polygons_to_check_intersection = [tl.baseline.to_polygon() for tl in textlines_to_check_intersection]

                skip = False
                for polygon in polygons_to_check_intersection:
                    if merged_article_rectangle.contains_polygon(polygon, merged_article_rectangle.x,
                                                                 merged_article_rectangle.y,
                                                                 merged_article_rectangle.width,
                                                                 merged_article_rectangle.height):
                        skip = True

                        # Try a copy shifted by (0, 1) with its height reduced by one; the merge survives
                        # only if that copy no longer contains the foreign baseline.
                        # NOTE(review): the ``break`` below is unconditional, so this loop runs exactly once
                        # although it is written as ``range(50)`` -- confirm whether the assignment and the
                        # ``break`` were meant to be inside the ``if``.
                        merged_article_rectangle_copy = copy.deepcopy(merged_article_rectangle)
                        for _ in range(50):
                            merged_article_rectangle_copy.translate(0, 1)
                            merged_article_rectangle_copy.height -= 1
                            if not merged_article_rectangle_copy.contains_polygon(polygon,
                                                                                  merged_article_rectangle_copy.x,
                                                                                  merged_article_rectangle_copy.y,
                                                                                  merged_article_rectangle_copy.width,
                                                                                  merged_article_rectangle_copy.height):
                                skip = False
                            merged_article_rectangle = merged_article_rectangle_copy
                            break

                    if skip:
                        break

                if skip:
                    continue

                # all checks passed: the candidate joins the rectangle, which takes over the merged bounds
                article_rectangle.textlines.append(textline_compare)
                article_rectangle.set_bounds(merged_article_rectangle.x, merged_article_rectangle.y,
                                             merged_article_rectangle.width, merged_article_rectangle.height)
                used_textline_ids.append(textline_compare.id)

            # a rectangle holding a single text line without surrounding polygon is shifted by (0, -10) and
            # gets a fixed height of 10
            if len(article_rectangle.textlines) == 1:
                if article_rectangle.textlines[0].surr_p:
                    # bb = article_rectangle.textlines[0].surr_p.to_polygon().get_bounding_box()
                    # article_rectangle.set_bounds(bb.x, bb.y, bb.width, bb.height)
                    pass
                else:
                    article_rectangle.translate(0, -10)
                    article_rectangle.height = 10

            if stretch:
                # whitespace height and stretch limit are derived from the number of rows of the binarized image
                img_height = len(binarized_image)
                article_rectangle = stretch_rectangle_until_whitespace(binarized_image, article_rectangle,
                                                                       whitespace_height=max(1, img_height // 1000),
                                                                       stretch_limit=img_height // 10)

            article_rectangles_dict[article_id].append(article_rectangle)

    return article_rectangles_dict
def get_data_from_pagexml(path_to_pagexml, des_dist=50, max_d=500, use_java_code=True):
    """
    Load a PAGE XML file and compute, for every text line with a usable baseline, its normed baseline polygon
    and its interline distance.

    Text lines without a baseline or with a baseline of less than two points are ignored. Text lines without
    a surrounding polygon get one assigned (on the text line objects of the returned article dictionary): the
    normed baseline together with a copy shifted one pixel to the right and upwards by 95% of the interline
    distance (at least one pixel).

    :param path_to_pagexml: file path
    :param des_dist: desired distance (measured in pixels) of two adjacent pixels in the normed polygons
    :param max_d: maximum distance (measured in pixels) for the calculation of the interline distances
    :param use_java_code: usage of methods written in java (faster than python!) or not

    :return: two dictionaries: {article id: corresponding list of text lines}
                               {text line id: (normed polygon, interline distance)}
    """
    # load the page xml file
    page_file = Page(path_to_pagexml)

    # get all text lines article wise
    art_txtlines_dict = page_file.get_article_dict()
    # get all text lines of the loaded page file
    lst_of_txtlines = page_file.get_textlines()

    lst_of_polygons = []
    lst_of_txtlines_adjusted = []

    for txtline in lst_of_txtlines:
        try:
            # get the baseline of the text line as polygon
            baseline = txtline.baseline.to_polygon()
            # baselines with less than two points are skipped
            if len(baseline.x_points) == len(baseline.y_points) > 1:
                # reuse the polygon instead of converting the baseline a second time
                lst_of_polygons.append(baseline)
                lst_of_txtlines_adjusted.append(txtline)
        except AttributeError:
            # the text line has no baseline (``baseline`` is None)
            continue

    # normed polygons
    lst_of_normed_polygons = norm_poly_dists(poly_list=lst_of_polygons, des_dist=des_dist)
    # interline distances
    lst_of_intdists = get_list_of_interline_distances(lst_of_polygons=lst_of_polygons, max_d=max_d,
                                                      use_java_code=use_java_code)

    # Text lines of the article dictionary grouped by id, built once so that a missing surrounding polygon can
    # be assigned without scanning all articles again for every text line.
    reference_txtlines_by_id = {}
    for article_txtlines in art_txtlines_dict.values():
        for reference_txtline in article_txtlines:
            reference_txtlines_by_id.setdefault(reference_txtline.id, []).append(reference_txtline)

    txtline_dict = {}
    for txtline, normed_polygon, intdist in zip(lst_of_txtlines_adjusted, lst_of_normed_polygons,
                                                lst_of_intdists):
        # check the surrounding polygon of the text line
        if txtline.surr_p is None:
            x_points_shifted = [x + 1 for x in normed_polygon.x_points]
            # y values are shifted upwards by at least one pixel
            y_shift = max(int(0.95 * intdist), 1)
            y_points_shifted = [y - y_shift for y in normed_polygon.y_points]

            # closed outline: along the baseline and back along the shifted copy
            sp_points = list(zip(normed_polygon.x_points + x_points_shifted[::-1],
                                 normed_polygon.y_points + y_points_shifted[::-1]))

            for reference_txtline in reference_txtlines_by_id.get(txtline.id, ()):
                reference_txtline.surr_p = Points(sp_points)

        txtline_dict[txtline.id] = (normed_polygon, intdist)

    return art_txtlines_dict, txtline_dict
                        )
                        skip = True
                if skip:
                    continue

                # Get full path
                for image_path in image_paths:
                    if curr_img in image_path:
                        curr_img_path = image_path
                        break
                # print(curr_img_path)

                # open corresponding PAGE file
                page_path = get_corresponding_page_path(curr_img_path)
                page = Page(page_path)
                article_dict = page.get_article_dict()
                used_article_ids = []

                fig, ax = plt.subplots()
                plot.add_image(ax, curr_img_path)

                highest_conf_img = 0.0

                with open(
                        os.path.join(path_to_query_folder, curr_img + ".txt"),
                        'w+') as text_file:
                    text_file.write(f"QUERY: '{_query}'\n\n")

                for hit in query_results:
                    if hit[0] == curr_img:
                        line_id = hit[2]