Example #1
0
 def transform_list(self, obj):
     """Turn an empty list into an error node; decline every other list.

     When ``obj["items"]`` is falsy the problem is reported through
     ``log_parser_error`` and an error node is returned. Otherwise
     ``NotInterested`` is raised.
     """
     if not obj["items"]:
         reason = "Empty List"
         log_parser_error(reason, obj)
         return {"type": "error", "message": reason}

     raise NotInterested()
Example #2
0
 def unfold_section(self, obj, level):
     """Regroup ``obj["content"]`` around the headers of depth ``level``.

     Headers are the entries of type ``"header"`` whose ``"depth"`` equals
     ``level``. ``self.split_list`` receives the same predicate and is
     presumably expected to return the runs of content between those
     headers (confirm against its definition). Each header is paired with
     one run to form a ``"section"`` node; a run in front of the first
     header is kept as-is ahead of the sections.

     When the number of headers fits neither layout, the problem is logged
     and the content is replaced by a single error node.

     Returns ``merge(obj, {"content": self(subsections)})``.
     """
     def is_heading(node):
         return node["type"] == "header" and node["depth"] == level

     body = obj["content"]
     headings = [node for node in body if is_heading(node)]
     contents = self.split_list(is_heading, body)

     def as_sections(chunks):
         # Pair every heading with the chunk of content that belongs to it.
         return [{
             "type": "section",
             "title": heading["content"],
             "depth": heading["depth"],
             "content": chunk
         } for heading, chunk in zip(headings, chunks)]

     if not headings and len(contents) == 1:
         # No underlying structure: keep the content untouched.
         subsections = body
     elif len(headings) == len(contents):
         # Nothing precedes the first header.
         subsections = as_sections(contents)
     elif len(headings) == len(contents) - 1:
         # One leading chunk precedes the first header.
         subsections = contents[0] + as_sections(contents[1:])
     else:
         # Headers and content chunks do not line up.
         message = "ill-formed structure in article"
         subsections = [{"type": "error", "message": message}]
         log_parser_error(message, obj, position=self.current_position)

     return merge(obj, {"content": self(subsections)})
Example #3
0
        def transform_dict(self, obj):
            """Report a ``span`` element typed ``mw:Image`` as a parser error.

            The error is logged with ``self.current_section`` as its position
            and an error node carrying the same message is returned.
            """
            # The comparisons below are bare expression statements; presumably
            # ``check`` makes a failed comparison abort this handler -- confirm
            # against its definition.
            check(obj, "type") == "element"
            check(obj, "name") == "span"
            check(obj, "attrs", "typeof") == "mw:Image"

            error = {"type": "error", "message": "Inline images are not allowed"}
            log_parser_error(error["message"], obj,
                             position=self.current_section)

            return error
Example #4
0
        def check_image(self, obj):
            """Decide what to do with an image based on its file extension.

            Returns ``None`` for ``.webm``/``.gif`` files and for objects with a
            truthy ``"noprint"`` entry. Raises ``NotInterested`` for ``.jpg``,
            ``.svg`` and ``.png`` files. Any other extension is logged and an
            error node is returned.
            """
            extension = os.path.splitext(obj["name"])[1]

            # TODO: Set "noprint" in galleryitem
            if extension in (".webm", ".gif") or obj.get("noprint", False):
                return None

            if extension in (".jpg", ".svg", ".png"):
                raise NotInterested()

            message = "Unrecognized image with extension " + extension
            log_parser_error(message, obj)
            return {"type": "error", "message": message}
Example #5
0
        def normalize(self, obj, mode):
            """Replace ``obj["formula"]`` by its normalized form.

            The formula is passed to ``self.api.normalize_formula`` together
            with ``mode``. In ``"tex"`` mode the ``{\\begin{aligned}`` prefix and
            the ``\\end{aligned}}`` suffix are additionally passed to
            ``remove_prefix`` / ``remove_suffix``.

            A ``ValueError`` raised during these steps is logged and turned
            into an error node; otherwise ``merge(obj, {"formula": ...})`` is
            returned.
            """
            try:
                normalized = self.api.normalize_formula(obj["formula"], mode)

                if mode == "tex":
                    normalized = remove_suffix(
                        remove_prefix(normalized, "{\\begin{aligned}"),
                        "\\end{aligned}}")
            except ValueError:
                error = {"type": "error", "message": "Wrong formatted formula"}
                # TODO: current_section was not set for this class
                log_parser_error(error["message"], obj)
                return error
            else:
                return merge(obj, {"formula": normalized})
Example #6
0
        def transform_dict(self, obj):
            """Convert a ``dl`` element into a ``definitionlist`` node.

            Children are paired by position: entries at even indexes supply
            the definition, the entries right after them the explanation. A
            trailing child without a partner is dropped by ``zip``.

            An element that yields no pair at all is logged and replaced by an
            error node.
            """
            # Bare comparison statements; presumably ``check`` makes a failed
            # comparison abort this handler -- confirm against its definition.
            check(obj, "type") == "element"
            check(obj, "name") == "dl"

            terms = obj["children"][::2]
            explanations = obj["children"][1::2]

            items = []
            for term, explanation in zip(terms, explanations):
                items.append({
                    "type": "definitionlistitem",
                    "definition": self(term["children"]),
                    "explanation": self(explanation["children"])
                })

            if items:
                return {"type": "definitionlist", "items": items}

            message = "A definition list must not be empty!"
            log_parser_error(message, obj, position=self.current_section)
            return {"type": "error", "message": message}
Example #7
0
        def parse_gallery_item(self, text):
            """Parse one ``name|caption`` gallery line into a ``galleryitem``.

            The text is split at its first ``|``. A line without any ``|`` is
            logged and yields an error node. Otherwise the stripped caption
            goes through ``parse_inline``, the license is fetched with
            ``self.api.get_image_license`` and the name is passed through
            ``canonical_image_name``.
            """
            name, separator, raw_caption = text.partition("|")

            if not separator:
                message = "Gallery item needs a caption"
                log_parser_error(message, text, position=self.current_section)
                return {"type": "error", "message": message}

            parsed_caption = parse_inline(self.api, self.title,
                                          raw_caption.strip())
            image_license = self.api.get_image_license(name)

            return {
                "type": "galleryitem",
                "caption": parsed_caption,
                "name": canonical_image_name(name),
                "license": image_license
            }
Example #8
0
        def transform_template(self, obj):
            """Translate a template node according to ``obj["name"]``.

            * A name listed in ``BOXSPEC`` yields the mapped parameters, each
              passed through ``self``, merged with the box type.
            * ``liste`` yields a ``list`` node, built either from the list
              nested in the ``liste`` parameter or from ``item_list``.
            * ``formel`` yields an ``equation`` node wrapped in exactly one
              ``align`` environment, or a logged error node when parameter
              ``1`` is not a single ``inlinemath`` node.
            * ``(!``, ``noprint`` and names starting with ``#invoke:`` yield
              ``None``.
            * ``todo`` yields a logged error node.
            * Anything else is logged and yields a ``notimplemented`` node.
            """
            name = obj["name"]

            for box_type, template_name, param_names in BOXSPEC:
                if name == template_name:
                    box = {
                        key: self(obj["params"].get(source, None))
                        for key, source in param_names.items()
                    }

                    return merge(box, {"type": box_type})

            if name == "liste":
                template_params = obj["params"]

                if "liste" in template_params:
                    nested = template_params["liste"][0]

                    assert nested["type"] == "list"

                    items, ordered = nested["items"], nested["ordered"]
                else:
                    items = [{
                        "type": "listitem",
                        "content": self(entry)
                    } for entry in template_params["item_list"]]
                    ordered = template_params.get("type", "") == "ol"

                return {
                    "type": "list",
                    "items": items,
                    "ordered": ordered,
                    "spacing": template_params.get("abstand", None)
                }

            if name == "formel":
                parts = obj["params"].get("1", [])
                is_single_math = (len(parts) == 1 and
                                  lookup(parts, 0, "type") == "inlinemath")

                if not is_single_math:
                    message = "Wrong formatted equation"
                    details = (
                        "Equation source code must be completely contained "
                        "in just one <math></math>.\n"
                        " (use \\text{this is not math} macro instead)")

                    log_parser_error(message, obj, details,
                                     self.current_section)

                    return {"type": "error", "message": message}

                opening, closing = "\\begin{align}", "\\end{align}"
                formula = parts[0]["formula"]

                # Strip an existing align wrapper first so the result is not
                # wrapped twice.
                if formula.startswith(opening) and formula.endswith(closing):
                    formula = remove_prefix(formula, opening)
                    formula = remove_suffix(formula, closing)

                return {
                    "type": "equation",
                    "formula": opening + formula + closing
                }

            # "#invoke:" templates are headers or footers.
            if name in ("(!", "noprint") or name.startswith("#invoke:"):
                return None

            if name == "todo":
                message = "Todo-Message in MediaWiki code."
                details = "Check if this TODO shoud be completed for a book release."
                log_parser_error(message, obj, details, self.current_section)

                return {"type": "error", "message": message}

            message = "Parsing of template `{}`".format(name)
            log_parser_error(message, obj, position=self.current_section)

            return {
                "type": "notimplemented",
                "target": obj,
                "message": message
            }
Example #9
0
        def transform_element(self, obj):
            """Translate an HTML element node into the matching output node.

            The decision is based on ``obj["name"]`` and, for some cases, on
            the ``typeof`` attribute read via ``lookup``. Children are
            converted by calling ``self`` on ``obj["children"]``.

            Disallowed elements (``br``, links without ``href``, headings
            other than ``h2``/``h3``, display-space placeholder spans) are
            logged and become error nodes. Elements matching no case are
            logged and become ``notimplemented`` nodes. Video thumbnails
            yield ``None``.
            """
            name = obj["name"]

            def wrap(kind):
                # Node of type ``kind`` holding the converted children.
                return {"type": kind, "content": self(obj["children"])}

            def fail(message):
                # Log ``message`` for this element and build the error node.
                log_parser_error(message, obj, position=self.current_section)
                return {"type": "error", "message": message}

            if name == "p":
                return wrap("paragraph")

            if name == "br":
                return fail("<br> not allowed")

            if name == "dfn":
                return wrap("i")

            if name in ("i", "b", "th", "tr", "td"):
                return wrap(name)

            if name in ("h2", "h3"):
                # Headers start at h2 in this project, hence the shift by one.
                depth = int(name[-1]) - 1
                return {
                    "type": "header",
                    "depth": depth,
                    "content": self(obj["children"])
                }

            if name == "a":
                url = obj["attrs"].get("href", "")

                if not url:
                    return fail("<a> tag without `href` url")

                if url.startswith("./"):
                    # TODO: The URL prefix should not be hardcoded here
                    url = "https://de.wikibooks.org/wiki/" + url[2:]

                assert url.startswith(("http://", "https://"))

                return {
                    "type": "href",
                    "url": url,
                    "content": self(obj["children"])
                }

            if name == "del":
                return wrap("strikethrough")

            if name == "blockquote":
                return wrap("blockquote")

            # From here on the order of the checks matters: the ``typeof``
            # cases take precedence over the name-based cases that follow.
            if lookup(obj, "attrs", "typeof") == "mw:Video/Thumb":
                # TODO: Proper parsing of videos
                return None

            if lookup(obj, "attrs", "typeof") == "mw:Extension/section":
                data = json.loads(obj["attrs"]["data-mw"])

                assert data["name"] == "section"

                section_attrs = data["attrs"]

                if "begin" in section_attrs:
                    return {
                        "type": "section_start",
                        "name": section_attrs["begin"]
                    }

                if "end" in section_attrs:
                    return {
                        "type": "section_end",
                        "name": section_attrs["end"]
                    }

                return {
                    "type": "error",
                    "message": "section must be either start or end."
                }

            if name in ("h1", "h4", "h5", "h6"):
                message = "Heading of depth {} is not allowed".format(name[1:])
                log_parser_error(message, obj, position=self.current_section)

                # NOTE(review): ``message`` is already formatted above, so
                # this second ``format`` call leaves it unchanged.
                return {
                    "type": "error",
                    "message": message.format(int(name[-1]))
                }

            if lookup(obj, "attrs", "typeof") == "mw:Entity":
                # TODO: Are there other entities?
                return {"type": "entity", "kind": " "}

            if (name == "span" and lookup(obj, "attrs", "typeof")
                    == "mw:DisplaySpace mw:Placeholder"):
                return fail("Spans with type {} are not allowed".format(
                    lookup(obj, "attrs", "typeof")))

            message = "Parsing of HTML element `{}`".format(name)
            log_parser_error(message, obj, position=self.current_section)

            return {
                "type": "notimplemented",
                "message": message,
                "target": obj
            }