def handle_col_floats(root):
    """Adjust the float-related classes of every infobox-like node.

    Every element whose class attribute contains the substring ``infobox`` is
    measured with ``node_size`` (the two returned values are presumably width
    and height -- confirm against ``node_size``).

    * If the second value is below ``config.min_float_height``, or it is below
      twice that threshold and ``node_is_floatable`` returns a truthy value,
      the class ``pp_no_float`` is appended.
    * Otherwise, if the class attribute contains ``pp_float_table``, that
      class is removed.
    """
    infoboxes = root.xpath('//*[contains(@class, "infobox")]')
    for box in infoboxes:
        width, height = node_size(box)
        min_height = config.min_float_height

        # Evaluate the cheap size test first; node_is_floatable() is only
        # consulted for nodes in the [min_height, 2 * min_height) band.
        suppress_float = height < min_height
        if not suppress_float:
            suppress_float = height < 2 * min_height and node_is_floatable(
                box, width, height
            )

        if suppress_float:
            utils.append_class(box, "pp_no_float")
            continue

        if "pp_float_table" in box.get("class", ""):
            utils.remove_class(box, "pp_float_table")
def map_classes(article):
    """Replace CSS classes on the article's nodes according to a class map.

    The map is obtained from ``get_map(_class_map, article.language)``; when
    that result is falsy nothing is done. For every element that has a
    ``class`` attribute, each map key found among the element's classes is
    removed and the mapped value is appended in its place.

    Args:
        article: object exposing ``language`` and ``dom`` (``dom`` must
            support ``xpath``).
    """
    class_map = get_map(_class_map, article.language)
    if not class_map:
        return

    for node in article.dom.xpath("//*[@class]"):
        # Class attributes are whitespace separated; split() without an
        # argument also copes with tabs, newlines and repeated spaces, where
        # split(" ") would yield empty or glued-together tokens.
        # The snapshot is taken once on purpose: classes appended below are
        # not themselves re-mapped.
        present = set(node.get("class").split())
        for old_cls, new_cls in class_map.items():
            if old_cls in present:
                utils.remove_class(node, old_cls)
                utils.append_class(node, new_cls)
def map_class_to_style(article):
    """Turn selected CSS classes into inline styles on the article's nodes.

    The mapping comes from ``get_map(_class_to_style_map, article.language)``
    and associates a class name with a ``(style attribute, style value)``
    pair. A falsy mapping makes this a no-op. For every matching node the
    style is added via ``utils.add_node_style`` and the class is then removed.

    Args:
        article: object exposing ``language`` and ``dom`` (``dom`` must
            support ``xpath``).
    """
    style_by_class = get_map(_class_to_style_map, article.language)
    if not style_by_class:
        return

    for css_class, style in style_by_class.items():
        style_attr, style_val = style
        # NOTE(review): contains() is a substring test on the raw attribute,
        # so a key "foo" also selects nodes whose class is e.g. "foobar".
        query = '//*[contains(@class, "{}")]'.format(css_class)
        for node in article.dom.xpath(query):
            utils.add_node_style(node, style_attr, style_val)
            utils.remove_class(node, css_class)
def add_figure_numbers(root):
    """Number the figures of each article and add captions and references.

    Within every ``article`` element, each descendant whose class attribute
    contains one of the figure classes gets a running number, starting at 1
    per article. The label is ``"Figure <pp_article_num>.<n> "``, where
    ``pp_article_num`` is read from the article element's attribute of that
    name.

    For each figure:

    * the ``infobox`` class is removed from the node;
    * a ``<p class="pp_figure_ref">`` reference element carrying the label is
      inserted after the figure;
    * a ``pp_figure`` node that contains a ``thumbcaption`` element gets the
      bold label prepended to that existing caption, which is then marked
      ``pp_figure_caption``;
    * any other figure is wrapped in a ``div`` carrying the figure class, a
      new ``pp_figure_caption`` div with the bold label is appended to the
      wrapper, and the figure class is removed from the node itself.

    Finally ``_combine_references(root)`` is called once.

    Args:
        root: root element of the document; must support ``xpath``.
    """
    # "infobox" is deliberately not listed: infoboxes are not referenced
    # despite floating.
    classes = [
        "pp_singlecol",
        "pp_figure",
        "pp_twocol_span",
    ]
    pred = " or ".join('contains(@class, "{}")'.format(cls) for cls in classes)
    figure_query = ".//*[{}]".format(pred)

    for article in root.xpath("//article"):
        article_num = article.get("pp_article_num")
        for figure_num, node in enumerate(article.xpath(figure_query), 1):
            utils.remove_class(node, "infobox")

            # The XPath predicate guarantees at least one entry matches; the
            # first one in list order wins, using the same substring test.
            cls = next(c for c in classes if c in node.get("class"))

            nr = ".".join([article_num, str(figure_num)])
            caption_txt = "Figure {nr} ".format(nr=nr)
            reference = E.p({"class": "pp_figure_ref"}, u"\u21AA " + caption_txt)

            thumb_captions = []
            if cls == "pp_figure":
                thumb_captions = node.xpath(
                    './/*[contains(@class, "thumbcaption")]'
                )

            if thumb_captions:
                # Reuse the existing thumbnail caption: put the bold label in
                # front and move the caption's leading text behind it.
                node.addnext(reference)
                caption = thumb_captions[0]
                prefix = E.b(caption_txt)
                caption.insert(0, prefix)
                prefix.tail = caption.text
                caption.text = None
                utils.append_class(caption, "pp_figure_caption")
            else:
                # No caption to reuse: wrap the figure and create a caption.
                wrapper = utils.wrap_node(node, "div", {"class": cls})
                caption = E.div({"class": "pp_figure_caption"}, E.b(caption_txt))
                wrapper.append(caption)
                utils.remove_class(node, cls)
                wrapper.addnext(reference)

    _combine_references(root)
def remove_low_ppi(root):
    """Strip the ``low-ppi`` class from every ``img`` element that carries it.

    Images are selected with ``contains(@class, "low-ppi")``, which is a
    substring test on the raw class attribute; the removal itself is
    delegated to ``utils.remove_class``.
    """
    for image in root.xpath('//img[contains(@class, "low-ppi")]'):
        utils.remove_class(image, "low-ppi")