def fix_image_tables(root):
    """Normalise short, non-infobox tables that contain image links.

    Every matching table loses its ``margin`` styles and is tagged
    ``image-table``.  The widest image (in px) is recorded per cell position.
    If the summed widths, scaled by ``config.px2pt``, exceed
    ``config.page_width_pt`` the table is tagged ``wide-image-table`` and each
    cell receives a percentage width proportional to its widest image.
    Otherwise, if any width was found, each image is resized individually.
    """
    selector = (
        '//table[contains(@class, "short-table") and not(contains(@class, "infobox")) '
        'and .//a[contains(@class, "image")]]'
    )
    for image_table in root.xpath(selector):
        utils.remove_node_styles(image_table, "margin")
        utils.append_class(image_table, "image-table")

        # widest image per cell position, measured in px
        widest = {}
        for tr in image_table.xpath(".//tr"):
            for pos, cell in enumerate(tr.xpath(".//td")):
                for image in cell.xpath(".//img"):
                    image_width = utils.get_node_width(image, target_unit="px")
                    widest[pos] = max(image_width, widest.get(pos, 0))
        width_sum = sum(widest.values())

        if width_sum * config.px2pt > config.page_width_pt:
            # too wide for the page: let the cells share the width proportionally
            utils.append_class(image_table, "wide-image-table")
            for tr in image_table.xpath(".//tr"):
                for pos, cell in enumerate(tr.xpath(".//td")):
                    _remove_inner_image_node_width(cell, "image")
                    utils.remove_node_styles(cell, ["padding-left", "padding", "margin"])
                    share = widest.get(pos, 0) / width_sum * 100
                    utils.add_node_style(cell, "width", "{}%".format(share))
            continue

        if width_sum > 0:
            for image in image_table.xpath(".//img"):
                _resize_image_node_width_to_pt(image)
def handle_tiny_table(node, width, height):
    """Float small tables - if they are followed by a sufficient amount of text.

    The decision is delegated to ``node_is_floatable``; when it accepts the
    node, the class ``pp_float_table`` is appended.
    """
    if not node_is_floatable(node, width, height):
        return
    utils.append_class(node, "pp_float_table")
def remove_img_style_size(root):
    """Add a ``col-*`` class to thumb containers and drop explicit sizes.

    Containers whose class contains ``map`` are left alone.  For the others,
    size styles are removed from every ``thumbinner`` descendant, the first
    ``<img>`` is measured in pt, and the container is tagged ``col-0``,
    ``col-4``, ``col-8`` or ``col-12`` depending on that width.
    """
    thumb_predicate = " ".join(
        [
            'contains(@class,"thumb") ',
            'and not(contains(@class, "tmulti"))',
            'and not(contains(@class, "thumbinner"))',
            'and not(contains(@class, "thumbcaption"))',
            'and not(contains(@class, "thumbimage"))',
        ]
    )
    for container in root.xpath("//div[{}]".format(thumb_predicate)):
        if "map" in container.attrib.get("class", ""):
            continue

        for inner in container.xpath('.//*[contains(@class,"thumbinner")]'):
            utils.remove_node_styles(inner, ["width", "height", "max-width"])

        images = container.xpath(".//img")
        if not images:
            log.debug("No <img> found in {}".format(etree.tostring(container)))
            continue

        image = images[0]
        width_pt = utils.get_node_width(image, target_unit="pt")
        utils.remove_node_styles(image, ["width", "height"])

        # width counted in groups of four base columns, capped at three groups
        groups = min(int(round(width_pt / (column_width_pt * 4))), 3)
        utils.append_class(container, "col-{}".format(groups * 4))
        utils.remove_node_width(container)
        utils.remove_node_width(image)
def change_references_id_to_class(root):
    """Replace ``id="References"`` with a ``references`` class.

    The class goes onto the node itself when it is an ``h2``, otherwise onto
    its parent.  The ``id`` attribute is removed from the node in both cases.
    """
    for ref_node in root.xpath('//*[@id="References"]'):
        target = ref_node if ref_node.tag == "h2" else ref_node.getparent()
        utils.append_class(target, "references")
        del ref_node.attrib["id"]
def move_caption(node):
    """Group a caption node with the content preceding it in a ``pp-table`` div.

    ``node`` is tagged ``pp-table-caption`` and any ``:`` is stripped from the
    text of its first grandchild.  Starting at the sibling just before
    ``node``, the search walks backwards over ``p``/``ul`` siblings (stopping at
    a gallery).  The wrapper is inserted after the sibling preceding the
    resulting position, and ``node`` plus the siblings from that position up to
    ``node`` are moved into it.  If the first collected sibling is a table, a
    copy of the caption is appended to it as a ``<caption class="following">``.
    """
    utils.append_class(node, "pp-table-caption")
    wrapper = E.div({"class": "pp-table"})
    try:
        node[0][0].text = node[0][0].text.replace(":", "")
        node[0].tail = ""
    except Exception:
        # Best effort: captions without the expected child structure are left
        # as they are.  ``format`` avoids the TypeError that ``str + bytes``
        # raises when ``etree.tostring`` returns bytes.
        print("Error at: {}".format(etree.tostring(node)))

    parent = node.getparent()
    node_pos = parent.index(node)
    nodelist = parent.getchildren()
    indexpos = node_pos - 1
    while nodelist[indexpos].tag in ["p", "ul"]:
        css_class = nodelist[indexpos].get("class")
        if css_class and "gallery" in css_class:
            break
        indexpos -= 1
    # indexpos is the beef
    wrapper.append(node)
    if indexpos < 1:
        indexpos = 1
    nodelist[indexpos - 1].addnext(wrapper)
    for i in range(indexpos, node_pos):
        wrapper.append(nodelist[i])

    # add second caption to tables; the wrapper holds only the caption when no
    # sibling was collected, so guard the index access
    if len(wrapper) > 1 and wrapper[1].tag == "table":
        node2 = deepcopy(node)
        node2.tag = "caption"
        utils.append_class(node2, "following")
        wrapper[1].append(node2)
def remove_style_sizes(root):
    """Strip explicit sizes from every table that carries a ``style`` attribute.

    Width/height styles and the node width are removed.  A non-empty ``border``
    attribute is dropped and replaced by the class ``pp_border_table``.
    """
    for styled_table in root.xpath("//table[@style]"):
        utils.remove_node_styles(styled_table, ["width", "height"])
        utils.remove_node_width(styled_table)
        if not styled_table.attrib.get("border"):
            continue
        del styled_table.attrib["border"]
        utils.append_class(styled_table, "pp_border_table")
def tag_local_images(root, collection):
    """Tag images whose file exists in the collection's image directory.

    ``collection["image_path"]`` names a directory; nothing happens when the
    key is missing/empty or the path does not exist.  Every ``<img>`` whose
    ``_src`` attribute ends in a file name found in that directory gets the
    class ``local-image``.
    """
    image_dir = collection.get("image_path")
    if not image_dir or not os.path.exists(image_dir):
        return

    # a set keeps the per-image membership test constant-time
    local_files = set(os.listdir(image_dir))
    for img in root.xpath("//img"):
        # call split on the value itself: the unbound ``str.split`` rejects
        # text objects that are not exactly ``str``
        file_name = img.get("_src", "").split("/")[-1]
        if file_name in local_files:
            utils.append_class(img, "local-image")
def handle_col_floats(root):
    """Decide for every infobox-classed node whether it may float.

    Nodes lower than ``config.min_float_height``, or lower than twice that
    height and accepted by ``node_is_floatable``, are tagged ``pp_no_float``.
    Any other node loses the class ``pp_float_table`` if it has it.
    """
    for box in root.xpath('//*[contains(@class, "infobox")]'):
        box_width, box_height = node_size(box)
        threshold = config.min_float_height
        prevent_float = box_height < threshold or (
            box_height < 2 * threshold and node_is_floatable(box, box_width, box_height)
        )
        if prevent_float:
            utils.append_class(box, "pp_no_float")
            continue
        if "pp_float_table" in box.get("class", ""):
            utils.remove_class(box, "pp_float_table")
def handle_span_all(node, width, height, two_col_max_size, debug):
    """Tag nodes wider than ``two_col_max_size`` with ``pp_singlecol``.

    With ``debug`` set the node is additionally given a red background.
    ``height`` is accepted but not used.
    """
    if width <= two_col_max_size:
        return
    if debug:
        utils.add_node_style(node, "background-color", "red")
    utils.append_class(node, "pp_singlecol")
def improve_table_breaks(root):
    """Mark the first and last rows of top-level, non-infobox tables.

    Up to ``config.table_no_break_max_lines`` rows from the top are tagged
    ``pp_nobreak_after``; the same number of rows from the bottom are tagged
    ``pp_nobreak_before``.
    """
    # https://de.wikipedia.org/wiki/Suzy_Batkovic-Brown
    selector = '//table[not(ancestor::table) and not(contains(@class, "infobox"))]'
    for table in root.xpath(selector):
        rows = table.xpath("./tr|./thead/tr|./tbody/tr")
        limit = min(len(rows), config.table_no_break_max_lines)
        for offset in range(limit):
            leading_row = rows[offset]
            trailing_row = rows[-(offset + 1)]
            utils.append_class(leading_row, "pp_nobreak_after")
            utils.append_class(trailing_row, "pp_nobreak_before")
def scale_inline(root):
    """Halve the size of small images and tag them ``inline``.

    Images whose ``width`` and ``height`` attributes are both below 50 are
    selected; their size, as reported by ``get_img_size``, is divided by two
    and written back to the attributes.
    """
    selector = "//img[@width<{max_inline_width}][@height<{max_inline_height}]".format(
        max_inline_width=50, max_inline_height=50
    )
    for small_img in root.xpath(selector):
        img_width, img_height = get_img_size(small_img)
        small_img.set("width", str(img_width / 2))
        small_img.set("height", str(img_height / 2))
        utils.append_class(small_img, "inline")
def map_classes(article):
    """Rename CSS classes according to the class map for the article language.

    The map is looked up via ``get_map(_class_map, article.language)``.  When
    it is empty, nothing happens.  Otherwise every node with a ``class``
    attribute has each mapped class it carries replaced by its target.
    """
    replacements = get_map(_class_map, article.language)
    if not replacements:
        return
    for element in article.dom.xpath("//*[@class]"):
        # snapshot of the classes before any replacement on this element
        present = element.get("class").split(" ")
        for old_name, new_name in replacements.items():
            if old_name not in present:
                continue
            utils.remove_class(element, old_name)
            utils.append_class(element, new_name)
def fix_galleries(root):
    """Strip sizes inside galleries and turn gallery boxes into ``col-4`` tiles.

    All descendants of a gallery ``<ul>`` lose width, height and margin.  Each
    ``gallerybox`` item is tagged ``col-4``; the node five levels down its
    first-child chain is tagged ``thumbimage`` and its ``src`` is set as the
    ``background-image`` of the node three levels down.
    """
    for gallery in root.xpath('.//ul[contains(@class, "gallery")]'):
        for descendant in gallery.xpath(".//*"):
            utils.remove_node_width(descendant)
            utils.remove_node_height(descendant)
            utils.remove_node_styles(descendant, "margin")

        for box in gallery.xpath('.//li[contains(@class, "gallerybox")]'):
            utils.append_class(box, "col-4")
            # fixed first-child chain: box -> ... -> frame -> ... -> image
            frame = box[0][0][0]
            image = frame[0][0]
            utils.append_class(image, "thumbimage")
            source = image.attrib.get("src")
            utils.add_node_style(frame, "background-image", "url({})".format(source))
def markup_maps(root):
    """Tag thumb containers that hold a position-overlay structure as ``map``.

    A ``<div>`` qualifies when its class contains ``thumb`` but none of
    ``thumbinner``/``thumbcaption``/``thumbimage``, and it contains a div whose
    style mentions ``relative`` with a descendant div whose style mentions
    ``absolute``.
    """
    conditions = [
        'contains(@class, "thumb")',
        'not(contains(@class, "thumbinner"))',
        'not(contains(@class, "thumbcaption"))',
        'not(contains(@class, "thumbimage"))',
        './/div[contains(@style, "relative") and .//div[contains(@style, "absolute")]]',
    ]
    # NOTE(review): the body used to carry the following selector as a bare,
    # unused string statement; it is kept here as a comment only:
    # //div[@class="mw-parser-output"]//div[contains(@style, "relative") and .//div[contains(@style, "absolute")]]
    selector = "//div[{}]".format(" and ".join(conditions))
    for node in root.xpath(selector):
        utils.append_class(node, "map")
def add_class_to_infobox_wide_images(root):
    """Add `infobox-img-wide` to infobox images wider than 100 and unsize them.

    Images without a ``width`` attribute are skipped.  Wide images lose their
    explicit width and height, and every ancestor ``<td>`` is tagged
    ``contains-img-wide``.  Images up to 100 wide keep a width attribute, but
    divided by ``config.px2pt``.
    """
    for img in root.xpath('//*[contains(@class, "infobox")]//img'):
        if "width" not in img.attrib:
            continue
        declared_width = int(img.attrib.get("width"))
        if declared_width > 100:
            utils.append_class(img, "infobox-img-wide")
            utils.remove_node_width(img)
            utils.remove_node_height(img)
            for cell in img.xpath("./ancestor::td"):
                utils.append_class(cell, "contains-img-wide")
        else:
            img.attrib["width"] = str(declared_width / config.px2pt)
def handle_two_col(node, width, height, reg_width, ext_width, debug):
    """Span a node across two columns when it is wider than the regular width.

    Only nodes with ``reg_width < width <= ext_width`` are touched.  Nodes
    taller than ``config.max_two_col_float_height`` are tagged ``pp_singlecol``
    instead of ``pp_twocol_span``.  With ``debug`` set the node gets an orange
    (single column) or yellow (two-column span) background.
    """
    if not (reg_width < width <= ext_width):
        return

    too_tall = height > config.max_two_col_float_height
    if too_tall:
        if debug:
            utils.add_node_style(node, "background-color", "orange")
        utils.append_class(node, "pp_singlecol")
        return

    utils.append_class(node, "pp_twocol_span")
    if debug:
        utils.add_node_style(node, "background-color", "yellow")
def add_pagebreaks(root, article):
    """Apply the page-break classes requested by ``article``.

    ``article`` may map ``page-break-before`` and/or ``page-break-after`` to
    lists of XPath expressions.  Every node matched below ``root`` is tagged
    with the class of the same name.  Returns ``root``.
    """
    for break_class in ("page-break-before", "page-break-after"):
        if break_class not in article:
            continue
        for expression in article[break_class]:
            for match in root.xpath(expression):
                utils.append_class(match, break_class)
    return root
def check_size(article):
    """Annotate non-SVG images with their effective print resolution.

    For every ``<img>`` whose ``src`` does not end in ``.svg``/``.SVG``, passes
    ``node_has_valid_image_src`` and exists on disk, the pixel width of the
    file is compared with the displayed width (``width`` attribute scaled by
    ``config.px2in``).  The result is stored in ``data-ppi`` and
    ``data-source-image-width``; images below 240 ppi are tagged ``low-ppi``.
    Images without a usable positive ``width`` attribute are skipped.
    """
    raster_images = (
        '//img[not(substring(@src, string-length(@src)-3) = ".svg"'
        ' or substring(@src, string-length(@src)-3) = ".SVG")]'
    )
    for img in article.dom.xpath(raster_images):
        if not node_has_valid_image_src(img):
            continue
        path = img.get("src")
        if os.name == "nt":
            # percent-encoded slashes/backslashes become plain slashes
            path = re.sub("%2[fF]|%5[cC]", "/", path)
        if not os.path.exists(path):
            continue

        # a missing, non-numeric or zero width would make the ppi computation
        # raise, so such images are skipped
        try:
            display_width_px = float(img.get("width"))
        except (TypeError, ValueError):
            continue
        if display_width_px <= 0:
            continue

        # the context manager closes the underlying file handle again
        with Image.open(path) as im:
            width, _height = im.size

        physical_width_in = config.px2in * display_width_px
        ppi = int(round(width / physical_width_in))
        img.set("data-ppi", str(ppi) + "ppi")
        img.set("data-source-image-width", str(width) + "px")
        if ppi < 240:
            utils.append_class(img, "low-ppi")
def identify_infoboxes(root):
    """Tag tables that look like infoboxes with the class ``infobox``.

    First pass: any table not yet classed ``infobox`` whose attribute values
    mention "infobox" (case-insensitive).  Second pass: tables found within
    the first siblings following the ``firstHeading`` ``h1``, see the comment
    in the body.
    """
    for candidate in root.xpath('//table[not(contains(@class, "infobox"))]'):
        mentions_infobox = any("infobox" in value.lower() for value in candidate.values())
        if mentions_infobox:
            utils.append_class(candidate, "infobox")

    # https://de.wikipedia.org/wiki/Das_M%C3%A4dchen_auf_dem_Meeresgrund
    # tables less than 3 siblings away from article start are considered infoboxes
    # if they are wrapped in container nodes, the containers are stripped if
    # no siblings are present - otherwise the table is *not* marked as an infobox
    leading_tables = "".join(
        [
            '//h1[@class="firstHeading"]/',
            "following-sibling::*[position()<3]/",
            'descendant-or-self::table[not(contains(@class, "infobox"))]',
        ]
    )
    for candidate in root.xpath(leading_tables):
        wrappers = [anc for anc in candidate.iterancestors() if anc.tag != "article"]
        if not all(len(anc.getchildren()) == 1 for anc in wrappers):
            continue
        if wrappers:
            # replace the outermost single-child container by the table itself
            outermost = wrappers[-1]
            outermost.getparent().replace(outermost, candidate)
        utils.append_class(candidate, "infobox")
def add_figure_numbers(root):
    """Number figure-like nodes per article and attach captions/references.

    Within every ``<article>`` each node classed ``pp_singlecol``,
    ``pp_figure`` or ``pp_twocol_span`` receives a running number prefixed by
    the article's ``pp_article_num`` attribute ("Figure <article>.<n> ").  A
    ``pp_figure_ref`` paragraph is placed after the figure.  ``pp_figure``
    nodes with a ``thumbcaption`` get the number prepended to that caption;
    other nodes are wrapped in a div that carries the class and a generated
    caption.  Finally ``_combine_references(root)`` is called.
    """
    classes = [
        "pp_singlecol",
        # 'infobox',  # infoboxes are not referenced despite floating
        "pp_figure",
        "pp_twocol_span",
    ]
    # XPath predicate matching any of the classes above
    pred = " or ".join('contains(@class, "{}")'.format(cls) for cls in classes)
    # NOTE(review): total_figures is incremented but never read in this block
    total_figures = 0
    for article in root.xpath("//article"):
        figure_num = 0
        for node in article.xpath(".//*[{}]".format(pred)):
            utils.remove_class(node, "infobox")
            figure_num += 1
            total_figures += 1
            # first entry of ``classes`` that occurs in the node's class string
            cls = [c for c in classes if c in node.get("class")][0]
            nr = ".".join([article.get("pp_article_num"), str(figure_num)])
            caption_txt = "Figure {nr} ".format(nr=nr)
            reference = E.p({"class": "pp_figure_ref"}, u"\u21AA " + caption_txt)
            if cls == "pp_figure":
                caption = node.xpath('.//*[contains(@class, "thumbcaption")]')
                if caption:
                    node.addnext(reference)
                    caption = caption[0]
                    # prepend the bold figure number; the caption's former
                    # leading text becomes the tail of the new <b> element
                    prefix = E.b(caption_txt)
                    caption.insert(0, prefix)
                    prefix.tail = caption.text
                    caption.text = None
                    utils.append_class(caption, "pp_figure_caption")
                    continue
            # no existing caption was reused: wrap the node, move the class to
            # the wrapper and add a generated caption
            wrapper = utils.wrap_node(node, "div", {"class": cls})
            caption = E.div({"class": "pp_figure_caption"}, E.b(caption_txt))
            wrapper.append(caption)
            utils.remove_class(node, cls)
            wrapper.addnext(reference)
    _combine_references(root)
def handle_table_width(node, width):
    """Set table width classes according to "natural" size and width attribute.

    Only ``<table>`` nodes are handled.  Tables up to ``config.reg_width`` are
    tagged ``reg-table``.  Wider tables that carry a ``width`` attribute lose
    it and are tagged ``wide-table``.  When the parent is a ``<div>`` and the
    width lies between ``config.reg_width`` (exclusive) and
    ``config.ext_width`` (inclusive), the parent is tagged ``wide-table`` too.
    """
    if node.tag != "table":
        return

    if width <= config.reg_width:
        utils.append_class(node, "reg-table")

    # tables blown up by width attributes
    if node.get("width") and width > config.reg_width:
        node.attrib.pop("width")
        utils.append_class(node, "wide-table")

    parent = node.getparent()
    if parent.tag == "div" and config.reg_width < width <= config.ext_width:
        utils.append_class(parent, "wide-table")
def resize_node_width_to_columns(node, width_in_pt, use_thirds_only=True):
    """Resize a given node to columns by adding a ``col-*`` class.

    The explicit width is removed and the first entry of ``config.columns``
    wider than ``width_in_pt`` determines the column count (its 1-based
    position).  With ``use_thirds_only`` the count is rounded up to a multiple
    of four.  If no entry is wide enough the node is either wrapped in an
    ``over-wide-wrapper`` div and tagged ``over-wide`` (width within
    ``config.tolerated_over_width``) or tagged ``rotated-table``.
    """
    utils.remove_node_width(node)

    # materialise the values once: dict views have no ``index`` method on
    # Python 3, a list works on both major versions
    column_widths = list(config.columns.values())
    target_col_width = next(
        (col_width for col_width in column_widths if col_width > width_in_pt), 0
    )
    if target_col_width == 0:
        if width_in_pt <= config.tolerated_over_width:
            utils.wrap_node(node, "div", {"class": "over-wide-wrapper"})
            utils.append_class(node, "over-wide")
        else:
            utils.append_class(node, "rotated-table")
        return

    cols = column_widths.index(target_col_width) + 1
    if use_thirds_only:
        cols = int(4 * ceil(float(cols) / 4))
    utils.append_class(node, "col-{}".format(cols))
def markup_short_tables(root):
    """Tag every table with 1 to 19 descendant rows as ``short-table``."""
    for table in root.xpath("//table"):
        row_count = len(table.xpath("descendant::tr"))
        if row_count == 0 or row_count >= 20:
            continue
        utils.append_class(table, "short-table")
def apply_article_options(root, options=""):
    """Tag the first ``<article>`` below ``root`` as ``nodisplay``.

    This only happens when ``"notext"`` is contained in ``options``.
    """
    if "notext" not in options:
        return
    first_article = root.find(".//article")
    utils.append_class(first_article, "nodisplay")
def markup_floated_tables(root):
    """Tag tables whose ``float`` style is ``right`` as ``right-floated-table``."""
    for table in root.xpath("//table"):
        style_map = utils.get_node_style(table)
        floats_right = "float" in style_map and style_map["float"] == "right"
        if not floats_right:
            continue
        utils.append_class(table, "right-floated-table")
def h1_add_no_top_margin(root):
    """Tag the article right after a chapter article with ``no-top-margin``.

    A chapter article is an ``<article class="pp_chapter">`` with exactly one
    child element.  The class is added to its first following sibling article.
    """
    # add no-top-margin class to articles that immediately follow a chapter
    selector = '//article[@class="pp_chapter" and count(*) = 1]/following-sibling::article[1]'
    for following_article in root.xpath(selector):
        utils.append_class(following_article, "no-top-margin")
def add_center_class(root):
    """Tag divs whose style contains ``text-align:center`` with ``center``."""
    centered_divs = root.xpath('//div[contains(@style, "text-align:center")]')
    for div in centered_divs:
        utils.append_class(div, "center")
def mark_img_container(root):
    """Tag thumb divs that are grandchildren of an article as ``pp_figure``.

    Example: https://de.wikipedia.org/wiki/Chaoyang_%28Shantou%29
    """
    selector = '//article/div/*[self::div[contains(@class,"thumb ")]]'
    for thumb in root.xpath(selector):
        utils.append_class(thumb, "pp_figure")