def transform_list(self, obj):
    """Turn an empty list node into an error node.

    A list whose ``obj["items"]`` is truthy is none of this method's
    business: ``NotInterested`` is raised for it. For an empty list the
    problem is reported through ``log_parser_error`` and an error node
    carrying the same message is returned.
    """
    if not obj["items"]:
        message = "Empty List"
        log_parser_error(message, obj)
        return {"type": "error", "message": message}

    raise NotInterested()
def unfold_section(self, obj, level):
    """Group ``obj["content"]`` into sections introduced by `level` headers.

    The content is split with ``self.split_list`` using "is a header of
    depth `level`" as the predicate (presumably the headers act as
    separators -- confirm against ``split_list``). Depending on how the
    number of headers relates to the number of resulting chunks, the
    content is left untouched, fully wrapped into section nodes, or wrapped
    into section nodes preceded by the leading chunk. Any other relation is
    reported as an ill-formed article.

    Returns ``obj`` merged with the new content, which is first passed
    through ``self``.
    """

    def is_heading(node):
        return node["type"] == "header" and node["depth"] == level

    def as_section(heading, body):
        return {
            "type": "section",
            "title": heading["content"],
            "depth": heading["depth"],
            "content": body,
        }

    headings = [node for node in obj["content"] if is_heading(node)]
    chunks = self.split_list(is_heading, obj["content"])

    if not headings and len(chunks) == 1:
        # case 1: no underlying structure
        subsections = obj["content"]
    elif len(headings) == len(chunks):
        # case 2: no paragraph before first header
        subsections = [
            as_section(heading, body)
            for heading, body in zip(headings, chunks)
        ]
    elif len(headings) == len(chunks) - 1:
        # case 3: one paragraph before first header
        wrapped = [
            as_section(heading, body)
            for heading, body in zip(headings, chunks[1:])
        ]
        subsections = chunks[0] + wrapped
    else:
        # mismatch between headers and paragraphs
        message = "ill-formed structure in article"
        subsections = [{"type": "error", "message": message}]
        log_parser_error(message, obj, position=self.current_position)

    return merge(obj, {"content": self(subsections)})
def transform_dict(self, obj):
    """Replace an inline image element by an error node.

    Targets ``span`` elements whose ``typeof`` attribute is ``mw:Image``.
    The problem is logged together with the current section and an error
    node with the same message is returned.
    """
    # NOTE(review): these bare comparisons are the matching guards; the
    # result is discarded, so `check` presumably aborts the transformation
    # itself when the value differs -- confirm against its definition.
    check(obj, "type") == "element"
    check(obj, "name") == "span"
    check(obj, "attrs", "typeof") == "mw:Image"

    message = "Inline images are not allowed"
    error_node = {"type": "error", "message": message}

    log_parser_error(message, obj, position=self.current_section)

    return error_node
def check_image(self, obj):
    """Decide by file extension what to do with the image node `obj`.

    The extension of ``obj["name"]`` is compared case-insensitively, so
    ``Foo.GIF`` is treated exactly like ``Foo.gif``.

    Returns:
        ``None`` for ``.webm`` / ``.gif`` files and for nodes with a truthy
        ``"noprint"`` entry; an error node for any other extension that is
        not recognized.

    Raises:
        NotInterested: for ``.jpg``, ``.svg`` and ``.png`` files.
    """
    _, ext = os.path.splitext(obj["name"])
    # Compare on a lower-cased copy; `ext` keeps the original spelling so
    # that the error message shows what is actually in the file name.
    normalized_ext = ext.lower()

    # TODO: Set "noprint" in galleryitem
    if normalized_ext in (".webm", ".gif") or obj.get("noprint", False):
        return None

    if normalized_ext in (".jpg", ".svg", ".png"):
        raise NotInterested()

    message = "Unrecognized image with extension " + ext
    log_parser_error(message, obj)
    return {"type": "error", "message": message}
def normalize(self, obj, mode):
    """Return `obj` with its formula normalized through the API.

    ``obj["formula"]`` is sent to ``self.api.normalize_formula`` with the
    given `mode`. In ``"tex"`` mode the result is additionally passed
    through ``remove_prefix`` / ``remove_suffix`` for the
    ``{\\begin{aligned}`` ... ``\\end{aligned}}`` wrapper.

    A ``ValueError`` raised anywhere in that process is logged and turned
    into an error node; otherwise `obj` merged with the new formula is
    returned.
    """
    try:
        normalized = self.api.normalize_formula(obj["formula"], mode)

        if mode == "tex":
            normalized = remove_suffix(
                remove_prefix(normalized, "{\\begin{aligned}"),
                "\\end{aligned}}",
            )
    except ValueError:
        message = "Wrong formatted formula"
        # TODO: current_section was not set for this class
        log_parser_error(message, obj)
        return {"type": "error", "message": message}
    else:
        return merge(obj, {"formula": normalized})
def transform_dict(self, obj):
    """Convert a ``dl`` element into a definition list node.

    The children are consumed pairwise: the entries at even positions
    supply the definitions, those at odd positions the explanations
    (presumably ``dt`` / ``dd`` elements -- their names are not verified
    here). ``zip`` stops at the shorter sequence, so a trailing unpaired
    child is ignored.

    Returns a ``definitionlist`` node, or a logged error node when no pair
    could be built.
    """
    # NOTE(review): the comparison results are discarded; `check` is
    # presumably responsible for rejecting non-matching objects -- confirm.
    check(obj, "type") == "element"
    check(obj, "name") == "dl"

    pairs = zip(obj["children"][::2], obj["children"][1::2])
    items = [
        {
            "type": "definitionlistitem",
            "definition": self(term["children"]),
            "explanation": self(explanation["children"]),
        }
        for term, explanation in pairs
    ]

    if items:
        return {"type": "definitionlist", "items": items}

    message = "A definition list must not be empty!"
    log_parser_error(message, obj, position=self.current_section)
    return {"type": "error", "message": message}
def parse_gallery_item(self, text):
    """Parse one gallery line of the form ``name|caption``.

    `text` is cut at the first ``|``. Without a ``|`` the item has no
    caption, which is logged and answered with an error node. Otherwise
    the stripped caption is parsed with ``parse_inline``, the license is
    requested from the API for the name as written, and a ``galleryitem``
    node with the canonical image name is returned.
    """
    name, separator, raw_caption = text.partition("|")

    if not separator:
        message = "Gallery item needs a caption"
        log_parser_error(message, text, position=self.current_section)
        return {"type": "error", "message": message}

    caption = parse_inline(self.api, self.title, raw_caption.strip())
    image_license = self.api.get_image_license(name)

    return {
        "type": "galleryitem",
        "caption": caption,
        "name": canonical_image_name(name),
        "license": image_license,
    }
def transform_template(self, obj):
    """Translate a template node into its dedicated node type.

    Handled in this order:

    * templates listed in ``BOXSPEC``: the configured parameters are
      transformed with ``self`` and merged with the box type,
    * ``liste``: becomes a ``list`` node, built either from an embedded
      list or from ``item_list``,
    * ``formel``: becomes an ``equation`` node when the first parameter
      consists of exactly one ``inlinemath`` node, else a logged error,
    * ``(!``, ``#invoke:...`` and ``noprint``: dropped (``None``),
    * ``todo``: logged and turned into an error node,
    * anything else: logged and wrapped in a ``notimplemented`` node.
    """
    name = obj["name"]

    for bname, tname, param_names in BOXSPEC:
        if name == tname:
            params = {
                key: self(obj["params"].get(source, None))
                for key, source in param_names.items()
            }
            return merge(params, {"type": bname})

    if name == "liste":
        template_params = obj["params"]

        if "liste" in template_params:
            sublist = template_params["liste"][0]
            assert sublist["type"] == "list"

            items = sublist["items"]
            ordered = sublist["ordered"]
        else:
            items = [
                {"type": "listitem", "content": self(entry)}
                for entry in template_params["item_list"]
            ]
            ordered = template_params.get("type", "") == "ol"

        return {
            "type": "list",
            "items": items,
            "ordered": ordered,
            "spacing": template_params.get("abstand", None),
        }

    if name == "formel":
        nodes = obj["params"].get("1", [])

        if not (len(nodes) == 1
                and lookup(nodes, 0, "type") == "inlinemath"):
            message = "Wrong formatted equation"
            details = (
                "Equation source code must be completely contained in "
                "just one <math></math>.\n"
                " (use \\text{this is not math} macro instead)"
            )
            log_parser_error(message, obj, details, self.current_section)
            return {"type": "error", "message": message}

        source = nodes[0]["formula"]
        opening = "\\begin{align}"
        closing = "\\end{align}"

        # Strip an existing align environment first so that the formula
        # ends up wrapped exactly once.
        if source.startswith(opening) and source.endswith(closing):
            source = remove_prefix(source, "\\begin{align}")
            source = remove_suffix(source, "\\end{align}")

        return {"type": "equation", "formula": opening + source + closing}

    if name == "(!":
        return None

    if name.startswith("#invoke:"):
        # Template is header or footer
        return None

    if name == "noprint":
        return None

    if name == "todo":
        message = "Todo-Message in MediaWiki code."
        details = "Check if this TODO shoud be completed for a book release."
        log_parser_error(message, obj, details, self.current_section)
        return {"type": "error", "message": message}

    message = "Parsing of template `{}`".format(name)
    log_parser_error(message, obj, position=self.current_section)
    return {"type": "notimplemented", "target": obj, "message": message}
def transform_element(self, obj):
    """Translate an HTML element node into the corresponding output node.

    The branches are tried in a fixed order: first the plain tag names
    (``p``, ``br``, ``dfn``, inline/table tags, ``h2``/``h3``, ``a``,
    ``del``, ``blockquote``), then elements recognized by their ``typeof``
    attribute, then the forbidden heading levels, entities and placeholder
    spans. Children are always transformed by calling ``self`` on them.

    Returns:
        The translated node, ``None`` for elements that are dropped, an
        error node (after logging) for content that is not allowed, or a
        ``notimplemented`` node for unknown elements.
    """
    name = obj["name"]

    if name == "p":
        return {"type": "paragraph", "content": self(obj["children"])}

    if name == "br":
        message = "<br> not allowed"
        log_parser_error(message, obj, position=self.current_section)
        return {"type": "error", "message": message}

    if name == "dfn":
        return {"type": "i", "content": self(obj["children"])}

    if name in ("i", "b", "th", "tr", "td"):
        return {"type": name, "content": self(obj["children"])}

    if name in ("h2", "h3"):
        return {
            "type": "header",
            # Header begin with h2 in our project -> subtract 1
            "depth": int(name[-1]) - 1,
            "content": self(obj["children"]),
        }

    if name == "a":
        url = obj["attrs"].get("href", "")

        if not url:
            message = "<a> tag without `href` url"
            log_parser_error(message, obj, position=self.current_section)
            return {"type": "error", "message": message}

        if url.startswith("./"):
            # TODO: The URL prefix should not be hardcoded here
            url = "https://de.wikibooks.org/wiki/" + url[2:]

        # Report unexpected link targets like every other content problem.
        # An `assert` here would vanish under `python -O` and would
        # otherwise abort the whole parse with an AssertionError.
        if not url.startswith(("http://", "https://")):
            message = "Unsupported URL `{}` in <a> tag".format(url)
            log_parser_error(message, obj, position=self.current_section)
            return {"type": "error", "message": message}

        return {"type": "href", "url": url, "content": self(obj["children"])}

    if name == "del":
        return {"type": "strikethrough", "content": self(obj["children"])}

    if name == "blockquote":
        return {"type": "blockquote", "content": self(obj["children"])}

    # From here on the `typeof` attribute takes part in the decision; it is
    # looked up once because `obj` is not modified below.
    typeof = lookup(obj, "attrs", "typeof")

    if typeof == "mw:Video/Thumb":
        # TODO: Proper parsing of videos
        return None

    if typeof == "mw:Extension/section":
        data = json.loads(obj["attrs"]["data-mw"])
        # Internal consistency check between `typeof` and the payload.
        assert data["name"] == "section"

        if "begin" in data["attrs"]:
            return {"type": "section_start", "name": data["attrs"]["begin"]}

        if "end" in data["attrs"]:
            return {"type": "section_end", "name": data["attrs"]["end"]}

        message = "section must be either start or end."
        log_parser_error(message, obj, position=self.current_section)
        return {"type": "error", "message": message}

    if name in ("h1", "h4", "h5", "h6"):
        message = "Heading of depth {} is not allowed".format(name[1:])
        log_parser_error(message, obj, position=self.current_section)
        return {"type": "error", "message": message}

    if typeof == "mw:Entity":
        # TODO: Are there other entities?
        return {"type": "entity", "kind": " "}

    if name == "span" and typeof == "mw:DisplaySpace mw:Placeholder":
        msg = "Spans with type {} are not allowed".format(typeof)
        log_parser_error(msg, obj, position=self.current_section)
        return {"type": "error", "message": msg}

    message = "Parsing of HTML element `{}`".format(name)
    log_parser_error(message, obj, position=self.current_section)
    return {"type": "notimplemented", "message": message, "target": obj}