def fix_image_tables(root):
    """
    Tag image tables and adjust the widths of the images they contain.

    Selects every ``short-table`` that is not an infobox and contains a link
    with class ``image``. Each match loses its ``margin`` style and gets the
    ``image-table`` class. The widest image (in px) per column index is then
    summed up:

    * if the sum, converted with ``config.px2pt``, exceeds
      ``config.page_width_pt``, the table is tagged ``wide-image-table`` and
      every cell gets a percentage width proportional to its column's
      widest image;
    * otherwise, if the sum is positive, every image in the table is passed
      to ``_resize_image_node_width_to_pt``.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    selector = '//table[contains(@class, "short-table") and not(contains(@class, "infobox")) and .//a[contains(@class, "image")]]'
    for img_table in root.xpath(selector):
        utils.remove_node_styles(img_table, "margin")
        utils.append_class(img_table, "image-table")

        # column index -> width (px) of the widest image found in that column
        widest = {}
        for tr in img_table.xpath(".//tr"):
            for idx, cell in enumerate(tr.xpath(".//td")):
                for image in cell.xpath(".//img"):
                    image_width = utils.get_node_width(image, target_unit="px")
                    widest[idx] = max(image_width, widest.get(idx, 0))
        total_width = sum(widest.values())

        too_wide = total_width * config.px2pt > config.page_width_pt
        if not too_wide:
            if total_width > 0:
                for image in img_table.xpath(".//img"):
                    _resize_image_node_width_to_pt(image)
            continue

        utils.append_class(img_table, "wide-image-table")
        for tr in img_table.xpath(".//tr"):
            for idx, cell in enumerate(tr.xpath(".//td")):
                # NOTE: this helper removes the cell when it holds no <img>
                _remove_inner_image_node_width(cell, "image")
                utils.remove_node_styles(cell, ["padding-left", "padding", "margin"])
                share = widest.get(idx, 0) / total_width * 100
                utils.add_node_style(cell, "width", "{}%".format(share))
def remove_img_style_size(root):
    """
    Swap explicit thumbnail sizes for a ``col-*`` class on the container.

    Works on ``div`` nodes whose class contains ``thumb`` but none of
    ``tmulti``, ``thumbinner``, ``thumbcaption`` or ``thumbimage``.
    Containers whose class contains ``map`` are left alone. Containers
    without an ``<img>`` are logged and skipped after their ``thumbinner``
    children have lost their size styles.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    predicate = " ".join(
        (
            'contains(@class,"thumb") ',
            'and not(contains(@class, "tmulti"))',
            'and not(contains(@class, "thumbinner"))',
            'and not(contains(@class, "thumbcaption"))',
            'and not(contains(@class, "thumbimage"))',
        )
    )
    for container in root.xpath("//div[{}]".format(predicate)):
        if "map" in container.attrib.get("class", ""):
            continue

        for inner in container.xpath('.//*[contains(@class,"thumbinner")]'):
            utils.remove_node_styles(inner, ["width", "height", "max-width"])

        images = container.xpath(".//img")
        if not images:
            log.debug("No <img> found in {}".format(etree.tostring(container)))
            continue

        first_img = images[0]
        # the width has to be read before the size styles are stripped
        width = utils.get_node_width(first_img, target_unit="pt")
        utils.remove_node_styles(first_img, ["width", "height"])

        # round to units of four columns, capped at three units
        units = min(int(round(width / (column_width_pt * 4))), 3)
        utils.append_class(container, "col-{}".format(units * 4))
        utils.remove_node_width(container)
        utils.remove_node_width(first_img)
def remove_style_sizes(root):
    """
    Strip explicit sizes from every table that carries a ``style`` attribute.

    The ``width``/``height`` styles and the node width are removed. When the
    table has a non-empty ``border`` attribute, that attribute is deleted and
    the table is tagged with the ``pp_border_table`` class instead.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    for styled_table in root.xpath("//table[@style]"):
        utils.remove_node_styles(styled_table, ["width", "height"])
        utils.remove_node_width(styled_table)

        attributes = styled_table.attrib
        if not attributes.get("border"):
            continue
        del attributes["border"]
        utils.append_class(styled_table, "pp_border_table")
def clean_infobox_padding(root):
    """
    Remove padding styles from styled ``div``/``td``/``th`` nodes in infoboxes.

    Only nodes whose ``style`` attribute mentions ``padding`` are touched.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    padding_styles = (
        "padding",
        "padding-left",
        "padding-right",
        "padding-top",
        "padding-bottom",
    )
    selector = '//*[contains(@class, "infobox")]//*[(self::div or self::td or self::th) and @style]'
    for styled_node in root.xpath(selector):
        if "padding" not in styled_node.attrib["style"]:
            continue
        # hand over a fresh list per call, as the helper receives a list
        utils.remove_node_styles(styled_node, list(padding_styles))
def _resize_image_node_width_to_pt(node):
    """
    Shrink an ``<img>`` node by the px-to-pt factor.

    The px width is multiplied by ``config.px2pt`` and written back as a
    ``width`` style in px (96px -> 72pt, i.e. roughly 75% of the original
    size). The factor is more or less deliberate but looks decent in sample
    pages. Nodes that are not ``img`` elements are ignored.

    :param node: element to resize in place
    """
    if node.tag == "img":
        # read the width first: the next two calls strip the size information
        px_width = utils.get_node_width(node, target_unit="px")
        utils.remove_node_styles(node, ["width", "height"])
        utils.remove_node_width(node)
        scaled = px_width * config.px2pt
        utils.add_node_style(node, "width", "{}px".format(scaled))
def fix_galleries(root):
    """
    Prepare ``<ul class="gallery">`` lists for column-based layout.

    For every gallery:

    * width, height and ``margin`` style are removed from all descendants;
    * every ``gallerybox`` list item gets the ``col-4`` class, its image gets
      the ``thumbimage`` class, and the image URL is set as
      ``background-image`` on the image's grand-parent container.

    The image is looked up by position (five levels below the list item).
    Gallery boxes that do not have this nesting (for example boxes without
    an image) are logged and skipped instead of raising ``IndexError`` and
    aborting the whole pass.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    for gallery in root.xpath('.//ul[contains(@class, "gallery")]'):
        for leaf in gallery.xpath(".//*"):
            utils.remove_node_width(leaf)
            utils.remove_node_height(leaf)
            utils.remove_node_styles(leaf, "margin")

        for box in gallery.xpath('.//li[contains(@class, "gallerybox")]'):
            utils.append_class(box, "col-4")
            try:
                # positional lookup: box > * > * > frame > * > img
                frame = box[0][0][0]
                img = frame[0][0]
            except IndexError:
                log.debug(
                    "Unexpected gallerybox structure in {}".format(etree.tostring(box))
                )
                continue
            utils.append_class(img, "thumbimage")
            url = img.attrib.get("src")
            utils.add_node_style(frame, "background-image", "url({})".format(url))
def fix_img_style_size_tmulti(root):
    """
    Replace explicit widths of multi-image thumbs with col-* classes and percentages.

    Works on ``div`` nodes whose class contains both ``thumb`` and ``tmulti``
    (but none of ``thumbinner``, ``thumbcaption``, ``thumbimage``). The
    ``max-width`` of the first ``thumbinner`` descendant is taken as the total
    width, handed to ``resize_node_width_to_columns`` for the container, and
    used to express every ``tsingle`` child's image width as a percentage.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    predicate = " ".join(
        (
            'contains(@class,"thumb") ',
            'and contains(@class, "tmulti")',
            'and not(contains(@class, "thumbinner"))',
            'and not(contains(@class, "thumbcaption"))',
            'and not(contains(@class, "thumbimage"))',
        )
    )
    for container in root.xpath("//div[{}]".format(predicate)):
        inner = container.xpath('.//*[contains(@class, "thumbinner")]')[0]
        # read max-width before it is stripped from the node
        total_width = utils.get_node_size(inner, attr="max-width", target_unit="pt")
        utils.remove_node_styles(inner, "max-width")
        resize_node_width_to_columns(container, total_width)

        for single in inner.xpath('.//*[contains(@class, "tsingle")]'):
            image_width = _remove_inner_image_node_width(single, inner_class="thumbimage")
            percentage = image_width / total_width * 100
            utils.add_node_style(single, "width", "{}%".format(percentage))
def remove_styles(root):
    """
    Remove a fixed set of inline styles from every node that uses one of them.

    Nodes are selected by a substring match on their ``style`` attribute and
    then handed to ``utils.remove_node_styles`` together with the full list.

    The nodes are visited with an explicit loop: ``map`` is lazy on Python 3,
    so calling it only for its side effects and discarding the result would
    never touch any node.

    NOTE(review): another ``remove_styles`` is defined further down in this
    module and rebinds the name - confirm which of the two callers should get.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    styles = [
        "-moz-column-count",  # https://de.wikipedia.org/wiki/Decatur_County_%28Indiana%29
        "column-count",  # https://de.wikipedia.org/wiki/Decatur_County_%28Indiana%29
        "font",
        "font-size",
        "padding",  # https://en.wikipedia.org/wiki/A%26M_Records,_Inc._v._Napster,_Inc.
    ]
    predicate = " or ".join('contains(@style, "{}")'.format(style) for style in styles)
    for node in root.xpath("//*[{}]".format(predicate)):
        utils.remove_node_styles(node, styles)
def _remove_inner_image_node_width(node, inner_class="thumbinner"):
    """
    Strip explicit sizes from an image container and report the image width.

    Size styles are removed from ``node`` itself and from every descendant
    whose class contains ``inner_class``; the first ``<img>`` below ``node``
    additionally loses its width/height styles and its node width.

    Side effect: removes the node if it doesn't contain an image!

    :param node: container element to clean up in place
    :param inner_class: class of the wrapper nodes, e.g. "thumbinner" or "thumbimage"
    :return: original width of the image in pt, or 0 if the node was removed
    """
    size_styles = ("width", "height", "max-width")
    utils.remove_node_styles(node, list(size_styles))
    for wrapper in node.xpath('.//*[contains(@class,"{}")]'.format(inner_class)):
        utils.remove_node_styles(wrapper, list(size_styles))

    images = node.xpath(".//img")
    if images:
        first_img = images[0]
        # read the width before the size information is stripped
        original_width = utils.get_node_width(first_img, target_unit="pt")
        utils.remove_node_styles(first_img, ["width", "height"])
        utils.remove_node_width(first_img)
        return original_width

    log.debug("No <img> found in {}. Removing node.".format(etree.tostring(node)))
    utils.remove_node(node)
    return 0
def clean_infobox_inner_width(root):
    """
    Remove the ``width`` style from ``div`` nodes inside infoboxes.

    The XPath pre-selects divs whose ``style`` attribute mentions ``width``;
    the style is only removed when ``utils.get_node_style`` reports a
    ``width`` entry for the node.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    selector = '//*[contains(@class, "infobox")]//div[contains(@style, "width")]'
    for div in root.xpath(selector):
        if "width" not in utils.get_node_style(div):
            continue
        utils.remove_node_styles(div, "width")
def clean_infobox_background_color(root):
    """
    Remove background styles from infobox header cells.

    Applies to ``th`` nodes inside an infobox whose ``style`` attribute
    mentions ``background``.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    selector = '//*[contains(@class, "infobox")]//th[contains(@style, "background")]'
    for header_cell in root.xpath(selector):
        utils.remove_node_styles(header_cell, ["background-color", "background"])
def optimize_maps(root):
    """
    Remove the ``border`` style from bordered ``div`` nodes inside map containers.

    A map container is any ``div`` whose class contains ``map``.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    for map_div in root.xpath('//div[contains(@class, "map")]'):
        bordered = map_div.xpath('.//div[contains(@style, "border")]')
        for inner_div in bordered:
            utils.remove_node_styles(inner_div, "border")
def remove_p_padding(root):
    """
    Remove the ``padding`` style from paragraphs that declare one.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    padded_paragraphs = root.xpath('//p[contains(@style, "padding")]')
    for paragraph in padded_paragraphs:
        utils.remove_node_styles(paragraph, "padding")
def remove_styles(root):
    """
    Remove the ``margin-left`` and ``text-align`` styles from every table.

    NOTE(review): this module defines ``remove_styles`` twice; this later
    definition rebinds the name - confirm that this is intended.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    all_tables = root.xpath("//table")
    for table_node in all_tables:
        utils.remove_node_styles(table_node, ["margin-left", "text-align"])
def remove_pullquote_margin_styles(root):
    """
    Remove the ``margin`` style from tables whose class contains ``pullquote``.

    :param root: tree node offering ``xpath`` (the document to modify in place)
    """
    pullquotes = root.xpath('//table[contains(@class, "pullquote")]')
    for quote_table in pullquotes:
        utils.remove_node_styles(quote_table, "margin")